#!/Users/pukku/perl5/perlbrew/perls/current/bin/perl

# Archive a single Tumblr post to disk.
#
# The script fetches "<url>?format=json", requires that exactly one post
# comes back, and then writes into "<basedir>/<tumblelog name>/" either
#   * the raw photo file (for a photo post with nothing else worth keeping,
#     or when --photo is given), or
#   * an HTML rendering of the post built from the 'wrapper' template in the
#     __DATA__ section, plus any downloadable video files.
#
# Options:
#   --url=URL       address of the post to archive (required)
#   --photo         for photo posts, save only the photo itself
#   --basedir=DIR   directory under which per-tumblelog subdirectories live

use Modern::Perl '2012';
use experimental 'switch'; # because the perl maintainers are annoying
                           # (the script itself no longer uses given/when;
                           # the line is kept so the module list is unchanged)
use Carp;
use Data::Dump qw/pp/;
use Getopt::Long;
use LWP::Simple; # for the initial get of data
use LWP::UserAgent; # for retrieving the images
use JSON;
use Encode;
use Web::Query;
use Data::Section -setup;
use MIME::Base64 qw/encode_base64/;
use Template::Simple;

my $basedir = '/Users/pukku/Pictures/tumblr/'; # keep photos in subdirectories of here
my $url = '';
my $only_photo = 0;

GetOptions(
    "url=s"     => \$url,
    "photo"     => \$only_photo,
    "basedir=s" => \$basedir,
) or die("Could not parse options.\n");

if ($url eq '') {
    die("No post URL given; use --url.\n");
}

# Output directories are built as $basedir . <name>, so a --basedir given
# without a trailing slash would otherwise be glued onto the tumblelog name.
if (($basedir ne '') and ($basedir !~ m{/\z})) {
    $basedir .= '/';
}

my $tumblr_data = get_tumblr_data($url);
my $tumblelog = $tumblr_data->{'tumblelog'};

my $post_count = scalar(@{ $tumblr_data->{'posts'} || [] });
if ($post_count != 1) {
    die("Expected exactly one post, but retrieved $post_count.\n");
}

my $post = $tumblr_data->{'posts'}[0];
my $type = $post->{"type"} // '';

# create a general holder
my $post_data = get_common_data($tumblelog, $post);

# handlers unique to each data type; each one adds its fields to $post_data
my %handler_for = (
    'photo'   => \&add_photo_data,
    'regular' => \&add_regular_data,
    'answer'  => \&add_answer_data,
    'video'   => \&add_video_data,
    'quote'   => \&add_quote_data,
);

my $output;

# special case for only downloading the photo
# @TODO add this for video?
if (($type eq 'photo') and ($only_photo or download_only_photo($post))) {
    $output = fetch_bare_photo($tumblelog, $post);
}
else {
    my $handler = $handler_for{$type};
    if (!$handler) {
        die("Unknown post type: $type\n");
    }
    $handler->($post_data);
    $output = create_output_data($post_data);
}

# create the output file
write_output($output);

#################

# download just the photo of a single-photo post
# returns an output description: { dir, file, data }
# dies if the photo cannot be retrieved
sub fetch_bare_photo {
    my ($t, $p) = @_;

    my $ua = LWP::UserAgent->new(ssl_opts => { verify_hostname => 0 });
    my $resp = $ua->get($p->{'photo-url-1280'});
    if (!$resp->is_success) {
        die("Could not download just the photo: " . $resp->status_line . "\n");
    }

    return {
        'dir'  => $basedir . $t->{'name'},
        'file' => $p->{'id'} . '--' . $resp->filename(),
        'data' => $resp->content(),
    };
}

# write an output description to disk
# $out_spec is { dir, file, data, download? }, where 'download' is an optional
# list of { url, name } entries that are fetched into the same directory.
# dies if the directory or main file cannot be written; a failed extra
# download only produces a warning.
sub write_output {
    my ($out_spec) = @_;

    my $dir  = $out_spec->{'dir'};
    my $path = $dir . '/' . $out_spec->{'file'};

    if (!-d $dir) {
        mkdir($dir) or die("Could not create directory $dir: $!\n");
    }

    # the data is already bytes (raw image content or UTF-8 encoded HTML),
    # so write it without any translating layers
    open(my $out, '>:raw', $path)
        or die("Could not create output file $path: $!\n");
    print {$out} $out_spec->{'data'}
        or die("Could not write output file $path: $!\n");
    close($out)
        or die("Could not finish writing output file $path: $!\n");

    foreach my $dl (@{ $out_spec->{'download'} || [] }) {
        my $res = getstore($dl->{'url'}, $dir . '/' . $dl->{'name'});
        if (!is_success($res)) {
            warn("video download failed: $res\n");
        }
    }

    return;
}

# retrieve the data from the server
# appends '?format=json' to $url, strips the 'var tumblr_api_read = ...;'
# wrapper from the response and returns the decoded JSON structure.
# croaks if nothing comes back or the response lacks that wrapper.
sub get_tumblr_data {
    my ($url) = @_;

    $url .= '?format=json';
    my $data = get($url);

    if (!defined($data) or ($data eq '')) {
        croak("no data\n");
    }
    if ($data !~ m/^var tumblr_api_read/) {
        croak("not a tumblr\n");
    }

    $data =~ s/^var tumblr_api_read = //;
    $data =~ s/;$//;

    return decode_json($data);
}

# pull out the data common to all types
# $t is the tumblelog record, $p the post record; both are also kept in the
# result under '_tumblelog' and '_post' for the type-specific handlers.
sub get_common_data {
    my ($t, $p) = @_;

    my $post = {
        '_tumblelog'   => $t,
        '_post'        => $p,
        'tumblr_key'   => $t->{'name'},
        'tumblr_title' => $t->{'title'},
        'post_id'      => $p->{'id'},
        'post_slug'    => $p->{'slug'} || $p->{'type'},
        'post_url'     => $p->{'url-with-slug'},
        'post_date'    => $p->{'date-gmt'},
    };

    # get any tags; an empty string is stored when there are none
    if (defined($p->{'tags'}) and scalar(@{$p->{'tags'}})) {
        $post->{'tags'}{'tgs'} = [ map { { 'tag' => $_ } } @{$p->{'tags'}} ];
    }
    else {
        $post->{'tags'} = '';
    }

    return $post;
}

# handle the photo type
# fills in 'post_body' and 'photos'; each photo entry gets 'photo_caption',
# 'photo_url' and 'photo_data' (an inline data: URI, or '' on failure).
sub add_photo_data {
    my ($post) = @_;
    my $p = $post->{'_post'};

    $post->{'post_body'} = $p->{'photo-caption'} || '';
    $post->{'photos'} = [];

    # if there is only one photo, tumblr puts the photo data at the top level.
    # if there is more than one, it is in a sub-array. this is inconsistent,
    # so we normalise both shapes into the same list.
    my @set = @{ $p->{'photos'} || [] };
    if (!@set) {
        push @{$post->{'photos'}}, {
            'photo_caption' => '',
            'photo_url'     => $p->{'photo-url-1280'},
        };
    }
    else {
        foreach my $ph (@set) {
            push @{$post->{'photos'}}, {
                'photo_caption' => $ph->{'caption'},
                'photo_url'     => $ph->{'photo-url-1280'},
            };
        }
    }

    # go through and download each photo
    foreach my $i (@{$post->{'photos'}}) {
        my $data = download_encoded_image($i->{'photo_url'});
        if (substr($data, 0, 4) eq 'data') {
            $i->{'photo_data'} = $data;
        }
        else {
            # the download failed: show the error text ahead of the caption.
            # NOTE(review): the single-quoted literals below contain only raw
            # newlines; they look like places where wrapper markup may once
            # have been -- confirm against the intended output.
            $i->{'photo_data'} = '';
            $i->{'photo_caption'} = '

' . $data . '

' . $i->{'photo_caption'};
        }
    }

    return;
}

# handle the regular (text) type
# 'post_body' is the post's body, preceded by its title when it has one.
sub add_regular_data {
    my ($post) = @_;
    my $p = $post->{'_post'};

    $post->{'post_body'} = $p->{'regular-body'};

    if (defined($p->{'regular-title'}) and ($p->{'regular-title'} ne '')) {
        # NOTE(review): newline-only single-quoted literals, kept as found
        $post->{'post_body'} = '

' . $p->{'regular-title'} . '

' . "\n\n" . $post->{'post_body'};
    }

    return;
}

# handle the answer type
# 'post_body' is the question followed by the answer.
sub add_answer_data {
    my ($post) = @_;
    my $p = $post->{'_post'};

    # NOTE(review): newline-only single-quoted literals, kept as found
    $post->{'post_body'} = '' . '
' . "\n" . $p->{'question'} . "\n" . '
' . "\n" . '
' . "\n" . $p->{'answer'} . "\n" . '
';

    return;
}

# handle the video type
# fills in 'post_body' and 'videos', and queues a downloadable .mp4 under
# '_videodownload' when the player markup exposes one; warns otherwise.
sub add_video_data {
    my ($post) = @_;
    my $p = $post->{'_post'};

    $post->{'post_body'} = $p->{'video-caption'} || '';
    $post->{'videos'} = [];

    # NOTE(review): newline-only single-quoted literals, kept as found
    my $pwq = wq('
' . $p->{'video-player'} . '
');

    my $vidtag = $pwq->find('video[data-crt-options]');
    if ($vidtag->size()) {
        my $vidopts = decode_json($vidtag->attr('data-crt-options'));
        if ($vidopts->{'hdUrl'}) {
            push @{$post->{'_videodownload'}}, {
                'url'  => $vidopts->{'hdUrl'},
                'name' => $p->{'id'} . '.mp4',
            };
        }
        else {
            my $source = $vidtag->find('source');
            if ($source->size()) {
                # either attribute may be absent; treat that as "no match"
                my $vurl  = $source->attr('src')  // '';
                my $vtype = $source->attr('type') // '';
                if (($vtype eq 'video/mp4') and ($vurl =~ m/video_file/)) {
                    push @{$post->{'_videodownload'}}, {
                        'url'  => $vurl,
                        'name' => $p->{'id'} . '.mp4',
                    };
                }
            }
        }
    }

    if (!$post->{'_videodownload'}) {
        warn("video type not supported.\n");
        warn(pp($p->{'video-player'}));
    }

    push @{$post->{'videos'}}, {
        'source'     => $p->{'video-source'} . "\n" . $p->{'video-player'},
        'controller' => '',
    };

    return;
}

# handle the quote type
# 'post_body' is the quoted text followed by its source.
sub add_quote_data {
    my ($post) = @_;
    my $p = $post->{'_post'};

    # NOTE(review): newline-only single-quoted literals, kept as found
    $post->{'post_body'} = '' . '
' . "\n" . $p->{'quote-text'} . "\n" . '
' . "\n" . '
' . "\n" . $p->{'quote-source'} . "\n" . '
';

    return;
}

# wrap up the logic to create the output data
# renders $post through the 'wrapper' template and returns an output
# description: { dir, file, data, download? }
sub create_output_data {
    my ($post) = @_;

    my $tmpl = Template::Simple->new();
    my $template = ${ main->section_data('wrapper') };
    my $rendered = encode('utf-8', ${ $tmpl->render( \$template, $post ) });

    my $output = {
        'dir'  => $basedir . $post->{'tumblr_key'},
        'file' => $post->{'post_id'} . '--' . $post->{'post_slug'} . '.html',
        'data' => $rendered,
    };

    if ($post->{'_videodownload'}) {
        push @{$output->{'download'}}, @{$post->{'_videodownload'}};
    }

    return $output;
}

# download an image file and turn it into base64 encoded inline data
# if everything works, the first characters of the returned string will be 'data'
# if an error occurs, the first characters of the returned string will be 'error'
sub download_encoded_image {
    my ($url) = @_;

    my $ua = LWP::UserAgent->new(ssl_opts => { verify_hostname => 0 }); # ideally we would verify
    my $resp = $ua->get($url);

    if (!$resp->is_success) {
        return 'error: ' . $resp->status_line();
    }

    my $content_type = $resp->header('Content-Type');
    my $data = encode_base64($resp->content(), '');
    return 'data:' . $content_type . ';base64,' . $data;
}

# test to see if there is anything besides the photo to be downloaded
# if not, we can avoid the overhead of base64ing the data
# returns 1 when the post is a lone photo with no caption and no tags, else 0
sub download_only_photo {
    my ($p) = @_;

    my $has_caption = (defined($p->{'photo-caption'}) and ($p->{'photo-caption'} ne ''));
    my $has_photoset = scalar(@{ $p->{'photos'} || [] });   # more than one photo
    my $has_tags = scalar(@{ $p->{'tags'} || [] });

    if (!$has_caption and !$has_photoset and !$has_tags) {
        return 1;
    }
    else {
        return 0;
    }
}

__DATA__
__[ wrapper ]__
[% tumblr_title %] — [% post_id %] — [% post_slug %]

[% tumblr_title %] — [% post_id %] — [% post_slug %]

[% post_date %]

[% START tags %]

[% START tgs %]#[% tag %][% END tgs %]

[% END tags %]
[% START photos %]
[% photo_caption %]
[% END photos %] [% START videos %] [% controller %] [% END videos %] [% post_body %]