#!/usr/bin/env perl

# Archive a single tumblr post: fetch its data, render it through the
# 'wrapper' template, inline the images as data urls and save the result
# below the base directory. Photo posts can be saved as the bare image.
#
# Options:
#   --url      address of the post to archive (required)
#   --basedir  directory to save into (default '.')
#   --photo    for photo posts, save only the image

use Modern::Perl '2012';
use experimental 'switch', 'postderef';
use utf8;

# handle command line parameters
use Getopt::Long;

# retrieve network data
use HTTP::Tiny;
use JSON::Tiny 'decode_json';

# template
use Data::Section -setup;
use Template::Simple;
use Encode;

# transcode images into data urls
use MIME::Base64 'encode_base64';
use Mojo::DOM58;

# get command line options
my $url        = '';
my $basedir    = '.';
my $only_photo = 0;
GetOptions(
    "url=s"     => \$url,
    "basedir=s" => \$basedir,
    "photo"     => \$only_photo
) or die("Could not parse options.\n");

# make sure we have a url
if ( $url eq '' ) {
    die("You must provide a url.\n");
}

# normalize the basedir
if ( $basedir !~ m[/$] ) {
    $basedir .= '/';
}

# get the tumblr data and the common elements
my $tumblr_data = get_tumblr_data($url);
my $post_data   = get_common_data( $tumblr_data->{'tumblelog'}, $tumblr_data->{'posts'}[ 0 ] );

# an undefined type is treated as an unknown one further down
my $post_type = $post_data->{'_type'} // '';

# if we can, or want to, download only the photo, short circuit the rest of the program
# (download_only_photo exits the program once the image is saved)
if ( ( $post_type eq 'photo' )
    and ( $only_photo or should_download_only_photo($post_data) ) )
{
    download_only_photo($post_data);
}

# specialize on the tumblr type; a lookup table is used rather than the
# experimental given/when construct
my %add_data_for = (
    'regular' => \&add_regular_data,
    'answer'  => \&add_answer_data,
    'quote'   => \&add_quote_data,
    'photo'   => \&add_photo_data,
    'video'   => \&add_video_data,
);
my $add_data = $add_data_for{$post_type}
    or die( "Unknown post type: " . $post_type . "\n" );
$add_data->($post_data);

# render the data
my $output = create_output_html($post_data);

# make all images data urls
convert_imgs($output);

# save the data
save_post($output);

# #################################################################################

# given a url, presume it is a tumblr-compliant url.
# get the data in json format and decode it
#
# Parameters: $url - address of a single tumblr post
# Returns:    the decoded data as a hash ref; it always carries a 'tumblelog'
#             entry and a 'posts' array holding exactly one post
# Dies:       when the request fails, when the response does not start with
#             the tumblr javascript prefix, or when the data has the wrong shape
sub get_tumblr_data {
    my ($url) = @_;

    $url .= '?format=json';

    # get data as json
    my $data = HTTP::Tiny->new->get($url);
    if ( !$data->{'success'} ) {
        die( "Error retrieving post: " . $data->{'status'} . ' ' . $data->{'reason'} );
    }
    if ( $data->{'content'} !~ m/^var tumblr_api_read = / ) {
        die("Error retrieving data: doesn't appear to be a tumblr.");
    }

    # tumblr returns Javascript, not really JSON
    my $content = ( $data->{'content'} =~ s/^var tumblr_api_read = //r );
    $content =~ s/;\s*\z//;
    my $json = decode_json($content);

    # make sure it looks right, in case we got some other sort of json
    # or for some reason retrieved too many posts
    if ( ref($json) ne 'HASH' or !defined( $json->{'tumblelog'} ) ) {
        die("Malformed JSON data received.\n");
    }
    if ( ref( $json->{'posts'} ) ne 'ARRAY' ) {
        die("Not enough posts received.\n");
    }

    # the count comes from the data just decoded, not from any file-level variable
    my $post_count = scalar( $json->{'posts'}->@* );
    if ( $post_count < 1 ) {
        die("Not enough posts received.\n");
    }
    if ( $post_count > 1 ) {
        die( "Too many posts received (" . $post_count . ").\n" );
    }

    return $json;
}

# take the tumblr json and extract the elements we want to use
#
# Parameters: $t - the 'tumblelog' part of the tumblr data
#             $p - the single post
# Returns:    a hash ref holding the raw data ('_tumblr', '_post', '_type')
#             and the fields handed to the template
sub get_common_data {
    my ( $t, $p ) = @_;

    my $post = {
        '_tumblr'      => $t,
        '_post'        => $p,
        '_type'        => $p->{'type'},
        'tumblr_key'   => $t->{'name'} || 'unknown',
        'tumblr_title' => $t->{'title'} || 'Unknown',
        'post_id'      => $p->{'id'},
        'post_slug'    => $p->{'slug'} || $p->{'type'},
        'post_url'     => $p->{'url-with-slug'} || $p->{'url'},
        'post_date'    => $p->{'date-gmt'},
        'post_tags'    => '',
        'post_body'    => '',
    };

    # fix tags; a post without a 'tags' entry is treated like one without tags
    my @tags = @{ $p->{'tags'} // [] };
    if (@tags) {
        $post->{'post_tags'} = { 'tags' => [ map { +{ 'tag' => $_ } } @tags ] };
    }

    return $post;
}

# given post data, create an "output" collection with the rendered HTML
#
# Parameters: $post - post data as built by get_common_data and the add_*_data subs
# Returns:    a hash ref with 'type', 'dir', 'file' and 'data', plus a
#             'download' list when the post asked for downloads
sub create_output_html {
    my ($post) = @_;

    my $tmpl     = Template::Simple->new();
    my $template = ${ main->section_data('wrapper') };
    my $rendered = ${ $tmpl->render( \$template, $post ) };

    # Template::Simple tends to leave behind whitespace
    $rendered =~ s/\n\t\n/\n/g;
    $rendered =~ s/\n\n\n+/\n\n/g;

    my $output = {
        'type' => 'html',
        'dir'  => $basedir . $post->{'tumblr_key'},
        'file' => $post->{'post_id'} . '--' . $post->{'post_slug'} . '.html',
        'data' => $rendered,
    };

    # copy over information on downloads, if they exist
    if ( $post->{'_download'} ) {
        push $output->{'download'}->@*, $post->{'_download'}->@*;
    }

    return $output;
}

# go through the rendered HTML and find any img tags, and change
# them into data urls
#
# Parameters: $output - output collection; its 'data' entry is changed in place
#
# Only img tags are rewritten; other tags that carry a src attribute are left
# alone. An image that cannot be fetched gets 'error: <status> <reason>' as
# its src.
sub convert_imgs {
    my ($output) = @_;

    my $ht = HTTP::Tiny->new();

    my $_convert_imgs_helper = sub {
        my ($url) = @_;

        # already inline, so there is nothing to fetch
        return $url if $url =~ m/\Adata:/i;

        my $data = $ht->get($url);
        if ( $data->{'success'} ) {
            my $ct = $data->{'headers'}->{'content-type'} // '';
            return 'data:' . $ct . ';base64,' . encode_base64( $data->{'content'}, '' );
        }
        else {
            return 'error: ' . $data->{'status'} . ' ' . $data->{'reason'};
        }
    };

    # the three groups are the tag up to the opening quote of src, the url,
    # and the remainder of the tag
    $output->{'data'} =~ s/(<img [^>]* src=") ([^"]+) (" [^>]* > )/$1 . $_convert_imgs_helper->($2) . $3/gex;

    return;
}

# write the data out to the file system. if there are any downloads (ie, videos)
# download them as well
#
# Parameters: $output - output collection with 'type', 'dir', 'file', 'data'
#                       and optionally 'download' (list of url/file pairs)
# Dies:       when the directory or the file cannot be created or written
#
# A failed download only produces a warning.
sub save_post {
    my ($output) = @_;

    if ( !-d $output->{'dir'} ) {
        mkdir $output->{'dir'}
            or die("Could not create output directory '$output->{'dir'}': $!\n");
    }

    my $path = $output->{'dir'} . '/' . $output->{'file'};

    # html is character data and gets encoded; anything else (the raw photo)
    # is bytes and must reach the disk untouched
    my $layer = ( ( $output->{'type'} // 'html' ) eq 'html' ) ? ':encoding(utf-8)' : ':raw';

    open( my $out, '>' . $layer, $path )
        or die("Could not create output file '$path': $!\n");
    print {$out} $output->{'data'}
        or die("Could not write output file '$path': $!\n");
    close($out)
        or die("Could not close output file '$path': $!\n");

    if ( $output->{'download'} ) {
        my $ht = HTTP::Tiny->new();
        foreach my $dl ( $output->{'download'}->@* ) {
            my $result = $ht->mirror( $dl->{'url'}, $output->{'dir'} . '/' . $dl->{'file'} );
            if ( !$result->{'success'} ) {
                warn("download failed for '$dl->{'url'}' to $dl->{'file'}: $result->{'status'} $result->{'reason'}\n");
            }
        }
    }

    return;
}

# #################################################################################

# check to see if this post is sufficiently unencumbered that we can download just the
# photo data
#
# Parameters: $post_data - post data as built by get_common_data
# Returns:    1 when the post has no caption, no 'photos' list and no tags,
#             0 otherwise
sub should_download_only_photo {
    my ($post_data) = @_;
    my $p = $post_data->{'_post'};

    my $has_caption = ( defined( $p->{'photo-caption'} ) and ( $p->{'photo-caption'} ne '' ) );

    # a single photo leaves the 'photos' list empty
    my $has_photo_list = scalar( @{ $p->{'photos'} // [] } );

    # a missing 'tags' entry and an empty one both mean "no tags"
    my $has_tags = scalar( @{ $p->{'tags'} // [] } );

    if ( $has_caption or $has_photo_list or $has_tags ) {
        return 0;
    }
    return 1;
}

# download the photo directly to disk, skipping the whole HTML template stuff
#
# Parameters: $p - post data as built by get_common_data
# Dies:       when the post has no 'photo-url-1280' or the image cannot be fetched
#
# Does not return: the program exits once the image has been saved.
sub download_only_photo {
    my ($p) = @_;

    # make sure there is an image to download
    if ( !defined( $p->{'_post'}->{'photo-url-1280'} ) ) {
        die("Could not find an image URL to download.\n");
    }

    # retrieve the image
    my $ht   = HTTP::Tiny->new();
    my $data = $ht->get( $p->{'_post'}->{'photo-url-1280'} );
    unless ( $data->{'success'} ) {
        die( "Error retrieving photo only: " . $data->{'status'} . ' ' . $data->{'reason'} . "\n" );
    }

    # get the file name from the url
    my $filename = ( $data->{'url'} =~ s{^.+/([^/]+)$}{$1}r );

    # create an "output" object
    my $output = {
        'type' => 'img',
        'dir'  => $basedir . $p->{'tumblr_key'},
        'file' => $p->{'post_id'} . '--' . $filename,
        'data' => $data->{'content'},
    };

    save_post($output);
    exit(0);
}

# #################################################################################

# build the body of a 'regular' post: the optional title followed by the body
#
# NOTE(review): the literals around the title hold nothing but newlines; it
# looks as if markup was lost from them - confirm against the upstream file.
sub add_regular_data {
    my ($post) = @_;
    my $source = $post->{'_post'};

    if ( defined( $source->{'regular-title'} ) and ( $source->{'regular-title'} ne '' ) ) {
        $post->{'post_body'} = "\n\n" . $source->{'regular-title'} . "\n\n" . "\n\n";
    }

    $post->{'post_body'} .= $source->{'regular-body'} // '';

    return;
}

# build the body of an 'answer' post: the question followed by the answer
#
# NOTE(review): the separators hold nothing but newlines; it looks as if
# markup was lost from them - confirm against the upstream file.
sub add_answer_data {
    my ($post) = @_;
    my $source = $post->{'_post'};

    $post->{'post_body'}
        = "\n\n"
        . ( $source->{'question'} // '' )
        . "\n\n\n\n\n"
        . ( $source->{'answer'} // '' )
        . "\n\n";

    return;
}

# build the body of a 'quote' post: the quote text followed by its source
#
# NOTE(review): the separators hold nothing but newlines; it looks as if
# markup was lost from them - confirm against the upstream file.
sub add_quote_data {
    my ($post) = @_;
    my $source = $post->{'_post'};

    $post->{'post_body'}
        = "\n\n"
        . ( $source->{'quote-text'} // '' )
        . "\n\n\n\n\n"
        . ( $source->{'quote-source'} // '' )
        . "\n\n";

    return;
}

# collect caption and url of every photo of the post into 'post_photos'
sub add_photo_data {
    my ($post) = @_;
    my $source = $post->{'_post'};

    $post->{'post_photos'} = [];

    # if there is only one photo, tumblr provides the photo data at the top level
    # however, if there are multiple photos, it uses the photos array.
    my @photos = @{ $source->{'photos'} // [] };
    if ( !@photos ) {
        @photos = ( { 'caption' => $source->{'photo-caption'}, 'photo-url-1280' => $source->{'photo-url-1280'} } );
    }

    foreach my $ph (@photos) {
        push $post->{'post_photos'}->@*,
            {
            'caption' => $ph->{'caption'},
            'url'     => $ph->{'photo-url-1280'},
            };
    }

    return;
}

# describe the video of the post in 'post_videos' and, when a downloadable
# url could be found, queue it in '_download'
sub add_video_data {
    my ($post) = @_;
    my $source = $post->{'_post'};

    $post->{'post_videos'} = [];

    # every entry carries the same descriptive fields; only url and name vary
    my $add_video = sub {
        my ( $video_url, $name ) = @_;
        push $post->{'post_videos'}->@*,
            {
            'url'     => $video_url,
            'name'    => $name,
            'source'  => $source->{'video-source'} . "\n" . $source->{'video-player'},
            'caption' => $source->{'video-caption'},
            };
    };

    # although this could be a false assumption, I assume that there is only one video
    # and only one source
    my $dom           = Mojo::DOM58->new( $source->{'video-player'} );
    my $video_wrapper = $dom->find('video[data-crt-options]')->[ 0 ];
    my $video_source  = $dom->find('video source')->[ 0 ];

    if ( $video_wrapper and $video_source ) {
        my $video_options = decode_json( $video_wrapper->{'data-crt-options'} );
        if ( $video_options->{'hdUrl'} ) {
            $add_video->( $video_options->{'hdUrl'}, $post->{'post_id'} . '.mp4' );
        }
        elsif ( ( ( $video_source->{'type'} // '' ) eq 'video/mp4' )
            and ( ( $video_source->{'src'} // '' ) =~ m/video_file/ ) )
        {
            $add_video->( $video_source->{'src'}, $post->{'post_id'} . '.mp4' );
        }
        else {
            warn( "Unsupported video type.\n" . $source->{'video-player'} . "\n" );
            $add_video->( '', $post->{'post_id'} );
        }
    }
    else {
        warn( "Video data does not seem to make sense:\n" . $source->{'video-player'} . "\n" );
        $add_video->( '', $post->{'post_id'} );
    }

    foreach my $v ( $post->{'post_videos'}->@* ) {
        if ( $v->{'url'} ) {
            push $post->{'_download'}->@*,
                {
                'url'  => $v->{'url'},
                'file' => $v->{'name'},
                };
        }

        # NOTE(review): the controller is an empty string here; markup may
        # have been lost from this literal - confirm against the upstream file.
        $v->{'controller'} = '';
    }

    return;
}

__DATA__
__[ wrapper ]__
[% START post_tags %][% END post_tags %] [% tumblr_title %] — [% post_id %] — [% post_slug %]

[% tumblr_title %] — [% post_id %] — [% post_slug %]

[% post_date %]

[% START post_tags %]

[% START tags %][% tag %][% END tags %]

[% END post_tags %]
[% START post_photos %]
[% caption %]
[% END post_photos %] [% START post_videos %] [% controller %] [% caption %] [% END post_videos %] [% post_body %]