p5_tumblr_v2.pl

(plain text)


#!/usr/bin/env perl
use Modern::Perl '2012';
use experimental 'switch', 'postderef';
use utf8;

# handle command line parameters
use Getopt::Long;

# retrieve network data
use HTTP::Tiny;
use JSON::Tiny 'decode_json';

# template
use Data::Section -setup;
use Template::Simple;
use Encode;

# transcode images into data urls
use MIME::Base64 'encode_base64';
use Mojo::DOM58;

# get command line options
#   --url      address of the tumblr post to save (required)
#   --basedir  directory the per-tumblr output directories are created in (default: '.')
#   --photo    for photo posts, always save just the image instead of an HTML page
my $url        = '';
my $basedir    = '.';
my $only_photo = 0;
GetOptions( "url=s" => \$url, "basedir=s" => \$basedir, "photo" => \$only_photo )
  or die("Could not parse options.\n");

# make sure we have a url
if ( $url eq '' ) { die("You must provide a url.\n"); }

# normalize the basedir so a directory name can be appended to it directly
if ( $basedir !~ m[/$] ) { $basedir .= '/'; }

# get the tumblr data and the common elements
my $tumblr_data = get_tumblr_data($url);
my $post_data = get_common_data( $tumblr_data->{'tumblelog'}, $tumblr_data->{'posts'}[ 0 ] );

# if we can, or want to, download only the photo, short circuit the rest of the program
# (download_only_photo exits the program itself once the image is saved)
if ( ( $post_data->{'_type'} eq 'photo' ) and ( $only_photo or should_download_only_photo($post_data) ) ) {
    download_only_photo($post_data);
}

# specialize on the tumblr type. a dispatch table is used here rather than
# the experimental given/when construct.
my %add_data_for = (
    'regular' => \&add_regular_data,
    'answer'  => \&add_answer_data,
    'quote'   => \&add_quote_data,
    'photo'   => \&add_photo_data,
    'video'   => \&add_video_data,
);
my $post_type = $post_data->{'_type'} // '';
my $add_data  = $add_data_for{$post_type}
  or die( "Unknown post type: " . $post_type . "\n" );
$add_data->($post_data);

# render the data
my $output = create_output_html($post_data);

# make all images data urls
convert_imgs($output);

# save the data
save_post($output);

# #################################################################################

# given a url, presume it is a tumblr-compliant url.
# get the data in json format and decode it
#
# parameters: $url - address of a single tumblr post
# returns:    the decoded data as a hash ref; it is guaranteed to contain a
#             'tumblelog' entry and a 'posts' array holding exactly one post
# dies:       when the request fails, when the response does not start with the
#             tumblr javascript prefix, or when the decoded data does not have
#             the shape described above
sub get_tumblr_data {
    my ($url) = @_;
    $url .= '?format=json';    # get data as json

    my $data = HTTP::Tiny->new->get($url);
    if ( !$data->{'success'} ) {
        die( "Error retrieving post: " . $data->{'status'} . ' ' . $data->{'reason'} );
    }
    if ( $data->{'content'} !~ m/^var tumblr_api_read = / ) {
        die("Error retrieving data: doesn't appear to be a tumblr.");
    }

    # tumblr returns Javascript, not really JSON
    my $content = ( $data->{'content'} =~ s/^var tumblr_api_read = //r );
    $content =~ s/;$//;
    my $json = decode_json($content);

    # make sure it looks right, in case we got some other sort of json
    # or for some reason retrieved too many posts
    if ( !defined( $json->{'tumblelog'} ) ) {
        die("Malformed JSON data received.\n");
    }
    if ( !defined( $json->{'posts'} ) ) {
        die("Not enough posts received.\n");
    }
    if ( ref( $json->{'posts'} ) ne 'ARRAY' ) {
        die("Malformed JSON data received.\n");
    }

    # count the posts of the data we just decoded; the file level $tumblr_data
    # has not been assigned yet while this sub is running
    my $post_count = scalar( $json->{'posts'}->@* );
    if ( $post_count == 0 ) {
        die("Not enough posts received.\n");
    }
    if ( $post_count != 1 ) {
        die( "Too many posts received (" . $post_count . ").\n" );
    }

    return $json;
}

# take the tumblr json and extract the elements we want to use
#
# parameters: $t - the 'tumblelog' part of the tumblr data
#             $p - the single post taken from the 'posts' array
# returns:    a hash ref holding the values used by the template, plus the raw
#             data under '_tumblr' and '_post' and the post type under '_type'.
#             'post_tags' is an empty string when the post has no tags,
#             otherwise { 'tags' => [ { 'tag' => ... }, ... ] }
sub get_common_data {
    my ( $t, $p ) = @_;

    my $post = {
        '_tumblr' => $t,
        '_post'   => $p,
        '_type'   => $p->{'type'},

        'tumblr_key'   => $t->{'name'}  || 'unknown',
        'tumblr_title' => $t->{'title'} || 'Unknown',

        'post_id'   => $p->{'id'},
        'post_slug' => $p->{'slug'} || $p->{'type'},
        'post_url'  => $p->{'url-with-slug'} || $p->{'url'},
        'post_date' => $p->{'date-gmt'},
        'post_tags' => '',

        'post_body' => '',
    };

    # fix tags. a post without a 'tags' entry is treated like a post with an
    # empty tag list; dereferencing the missing entry directly would be fatal
    # under strict refs.
    my @tags = @{ $p->{'tags'} // [] };
    if (@tags) {
        $post->{'post_tags'} = { 'tags' => [ map { +{ 'tag' => $_ } } @tags ] };
    }

    return $post;
}

# render the post through the 'wrapper' template and package the result as an
# "output" collection: type, target directory, file name and the html itself
sub create_output_html {
    my ($post) = @_;

    # fill the 'wrapper' section of this file with the post values
    my $engine   = Template::Simple->new();
    my $template = ${ main->section_data('wrapper') };
    my $rendered = ${ $engine->render( \$template, $post ) };

    # Template::Simple tends to leave behind whitespace
    for ($rendered) {
        s/\n\t\n/\n/g;
        s/\n\n\n+/\n\n/g;
    }

    my %output = (
        'type' => 'html',
        'dir'  => $basedir . $post->{'tumblr_key'},
        'file' => $post->{'post_id'} . '--' . $post->{'post_slug'} . '.html',
        'data' => $rendered,
    );

    # carry along the list of files that still need downloading, if there is one
    push @{ $output{'download'} }, @{ $post->{'_download'} } if $post->{'_download'};

    return \%output;
}

# rewrite the src of every img tag in the rendered HTML so that it holds the
# image itself as a data url. the html in $output->{'data'} is changed in place.
sub convert_imgs {
    my ($output) = @_;

    my $http = HTTP::Tiny->new();

    # fetch one image and turn it into a data url; a failed request yields
    # an 'error: ...' text instead
    my $to_data_url = sub {
        my ($src) = @_;
        my $response = $http->get($src);

        return 'error: ' . $response->{'status'} . ' ' . $response->{'reason'}
          unless $response->{'success'};

        my $mime_type = $response->{'headers'}->{'content-type'};
        return 'data:' . $mime_type . ';base64,' . encode_base64( $response->{'content'}, '' );
    };

    $output->{'data'} =~ s/( <img [^>]* src=") ([^"]+) (" [^>]* > )/$1 . $to_data_url->($2) . $3/gex;
}

# write the data out to the file system. if there are any downloads (ie, videos)
# download them as well
#
# parameters: $output - hash ref with 'dir', 'file' and 'data', and optionally
#                       'type' and a 'download' list of { 'url', 'file' } entries
# dies:       when the directory or the file cannot be created, written or
#             closed. a failed download only produces a warning.
sub save_post {
    my ($output) = @_;

    my $path = $output->{'dir'} . '/' . $output->{'file'};

    if ( !-d $output->{'dir'} ) {
        mkdir $output->{'dir'} or die("Could not create output directory '$output->{'dir'}': $!\n");
    }

    # only rendered html is text that has to be encoded. anything else (ie, the
    # image of a photo only download) is raw bytes, and pushing those through an
    # encoding layer would corrupt the file. a missing type is treated as html.
    my $is_html = ( ( $output->{'type'} // 'html' ) eq 'html' );
    my $mode = $is_html ? '>:encoding(utf-8)' : '>:raw';

    open( my $out, $mode, $path )
      or die("Could not create output file '$path': $!\n");
    print {$out} $output->{'data'}
      or die("Could not write output file '$path': $!\n");
    close($out) or die("Could not close output file '$path': $!\n");

    if ( $output->{'download'} ) {
        my $ht = HTTP::Tiny->new();
        foreach my $dl ( $output->{'download'}->@* ) {
            my $result = $ht->mirror( $dl->{'url'}, $output->{'dir'} . '/' . $dl->{'file'} );
            if ( !$result->{'success'} ) {
                warn("download failed for '$dl->{'url'}' to $dl->{'file'}: $result->{'status'} $result->{'reason'}\n");
            }
        }
    }
}

# #################################################################################

# check to see if this post is sufficiently unencumbered that we can download just the
# photo data
#
# parameters: $post_data - the common post data; the raw post is read from '_post'
# returns:    1 when the post has no caption, no entries in its 'photos' list
#             (a single photo is described at the top level of the post) and
#             no tags; 0 otherwise. entries that are missing altogether count
#             as empty.
sub should_download_only_photo {
    my ($post_data) = @_;
    my $p = $post_data->{'_post'};

    my $caption = $p->{'photo-caption'} // '';
    my @photos  = @{ $p->{'photos'} // [] };
    my @tags    = @{ $p->{'tags'} // [] };

    return 0 if $caption ne '';    # there is a caption worth keeping
    return 0 if @photos;           # there is more than one photo
    return 0 if @tags;             # there are tags worth keeping

    return 1;
}

# save just the image of a photo post, bypassing the HTML template entirely.
# this sub ends the program once the image has been handed to save_post.
sub download_only_photo {
    my ($p) = @_;

    # without an image url there is nothing to do
    my $photo_url = $p->{'_post'}->{'photo-url-1280'};
    die("Could not find an image URL to download.\n") unless defined($photo_url);

    # fetch the image
    my $response = HTTP::Tiny->new()->get($photo_url);
    if ( !$response->{'success'} ) {
        die( "Error retrieving photo only: " . $response->{'status'} . ' ' . $response->{'reason'} . "\n" );
    }

    # the file name is whatever follows the last slash of the url that was
    # reported in the response
    my $filename = ( $response->{'url'} =~ s{^.+/([^/]+)$}{$1}r );

    # hand an "output" object to the regular save routine and stop
    save_post(
        {
            'type' => 'img',
            'dir'  => $basedir . $p->{'tumblr_key'},
            'file' => $p->{'post_id'} . '--' . $filename,
            'data' => $response->{'content'},
        }
    );
    exit(0);
}

# #################################################################################

# build the body of a regular (text) post: an optional <h2> title followed by
# the body as delivered by tumblr
sub add_regular_data {
    my ($post) = @_;
    my $source = $post->{'_post'};
    my $title  = $source->{'regular-title'};

    # the heading is only written when there is a title to show
    if ( defined($title) and length($title) ) {
        $post->{'post_body'} = "<h2>${title}</h2>\n\n";
    }

    $post->{'post_body'} .= $source->{'regular-body'};
}

# build the body of an answer post: the question and the answer, each wrapped
# in a div of its own
sub add_answer_data {
    my ($post) = @_;

    my $question = $post->{'_post'}->{'question'};
    my $answer   = $post->{'_post'}->{'answer'};

    $post->{'post_body'} = qq{<div class="answer-q">\n${question}\n</div>\n<div class="answer-a">\n${answer}\n</div>};
}

# build the body of a quote post: the quoted text and its source, each wrapped
# in a div of its own
sub add_quote_data {
    my ($post) = @_;

    my $text   = $post->{'_post'}->{'quote-text'};
    my $origin = $post->{'_post'}->{'quote-source'};

    $post->{'post_body'} = qq{<div class="quote-text">\n${text}\n</div>\n<div class="quote-source">\n${origin}\n</div>};
}

# collect the photos of a photo post in $post->{'post_photos'}, one
# { 'caption', 'url' } entry per photo
sub add_photo_data {
    my ($post) = @_;
    my $source = $post->{'_post'};

    $post->{'post_photos'} = [];

    # if there is only one photo, tumblr provides the photo data at the top level
    # however, if there are multiple photos, it uses the photos array.
    # a post without any 'photos' entry is handled like one with an empty array;
    # dereferencing the missing entry directly would be fatal under strict refs.
    my @photos = @{ $source->{'photos'} // [] };

    if ( !@photos ) {
        push $post->{'post_photos'}->@*, {
            'caption' => $source->{'photo-caption'},
            'url'     => $source->{'photo-url-1280'},
        };
        return;
    }

    foreach my $ph (@photos) {
        push $post->{'post_photos'}->@*, {
            'caption' => $ph->{'caption'},
            'url'     => $ph->{'photo-url-1280'},
        };
    }

    return;
}

# describe the video of a video post in $post->{'post_videos'}. when a video
# file url can be found it is also queued in $post->{'_download'}, and the
# rendered page refers to the local copy by name.
sub add_video_data {
    my ($post) = @_;
    my $source = $post->{'_post'};

    $post->{'post_videos'} = [];

    # although this could be a false assumption, I assume that there is only one video
    # and only one source
    my $dom           = Mojo::DOM58->new( $source->{'video-player'} );
    my $video_wrapper = $dom->find('video[data-crt-options]')->[ 0 ];
    my $video_source  = $dom->find('video source')->[ 0 ];

    # work out where the video file lives; this stays empty when the player
    # markup offers nothing we know how to download
    my $video_url = '';
    if ( !( $video_wrapper and $video_source ) ) {
        warn( "Video data does not seem to make sense:\n" . $source->{'video-player'} . "\n" );
    }
    else {
        my $video_options = decode_json( $video_wrapper->{'data-crt-options'} );
        if ( $video_options->{'hdUrl'} ) {
            $video_url = $video_options->{'hdUrl'};
        }
        elsif ( ( $video_source->{'type'} eq 'video/mp4' ) and ( $video_source->{'src'} =~ m/video_file/ ) ) {
            $video_url = $video_source->{'src'};
        }
        else {
            warn( "Unsupported video type.\n" . $source->{'video-player'} . "\n" );
        }
    }

    # only a video we can actually download gets the .mp4 file name
    my $video_name = $post->{'post_id'};
    $video_name .= '.mp4' if $video_url;

    push $post->{'post_videos'}->@*, {
        'url'     => $video_url,
        'name'    => $video_name,
        'source'  => $source->{'video-source'} . "\n" . $source->{'video-player'},
        'caption' => $source->{'video-caption'},
    };

    # queue the downloads and create the player markup
    foreach my $video ( $post->{'post_videos'}->@* ) {
        push $post->{'_download'}->@*, { 'url' => $video->{'url'}, 'file' => $video->{'name'} }
          if $video->{'url'};
        $video->{'controller'} = '<video src="' . $video->{'name'} . '" width="" height="" controls preload allowfullscreen></video>';
    }
}

__DATA__

__[ wrapper ]__
<!DOCTYPE html>
<html lang="en">
<head>
	<meta charset="utf-8" />
	<meta name="viewport" content="width=device-width, initial-scale=1.0" />
	<meta name="origin" content="[% post_url %]" />
	<meta name="tumblr-type" content="[% _type %]" />
	[% START post_tags %]<meta name="keywords" content="tumblr[% START tags %], [% tag %][% END tags %]" />[% END post_tags %]
	<title>[% tumblr_title %] — [% post_id %] — [% post_slug %]</title>
<!-- #bbinclude "../tumblr.css" -->
	<style type="text/css">
		body {
			margin: 0;
			padding: 0;
			font-family: "Hoefler Text", serif;
		}
		article {
			margin: 0;
			padding: 0;
			padding: 0.5rem;
			background: #EEEEEE;
			border: 0.25rem solid #DDDDDD;
		}
		article header {
			margin: 0;
			margin-bottom: 1rem;
			padding: 0;
			border-bottom: thin solid #BBBBBB;
		}
		article header h1 {
			margin: 0;
			padding: 0;
			margin-top: 1rem;
			margin-bottom: 0.25rem;
			font-size: 1.2rem;
		}
		article header p.date {
			margin: 0;
			padding: 0;
			margin-top: 0.5rem;
			margin-bottom: 0.5rem;
			font-size: 0.8rem;
		}
		article header p.tags {
			margin: 0;
			padding: 0;
			margin-top: 0.5rem;
			margin-bottom: 0.5rem;
			font-size: 0.8rem;
			font-style: italic;
		}
		article header p.tags span.tag {
			padding-right: 1rem;
		}
		article header p.tags span.tag:before {
			content: '# ';
			color: gray;
		}
		article figure img {
			max-width: 100%;
		}
		article blockquote {
			margin-left: 1rem;
			border-left: thin dashed #CCCCCC;
			padding-left: 0.5rem;
			margin-right: 0;
		}
		video {
			max-width: 100%;
		}
	</style>
<!-- end bbinclude -->
	<style type="text/css">
	</style>
</head>
<body>
<article>
<header>
	<h1><a href="[% post_url %]">[% tumblr_title %] — [% post_id %] — [% post_slug %]</a></h1>
	<p class="date">[% post_date %]</p>
	[% START post_tags %]<p class="tags">[% START tags %]<span class="tag">[% tag %]</span>[% END tags %]</p>[% END post_tags %]
</header>
[% START post_photos %]
<figure>
	<a href="[% url %]"><img src="[% url %]" /></a>
	<figcaption>[% caption %]</figcaption>
</figure>
[% END post_photos %]
[% START post_videos %]
<!-- [% source %] -->
[% controller %]
[% caption %]
[% END post_videos %]
[% post_body %]
</article>
</body>
</html>