p5_tumblr_v2.pl
(plain text)
#!/usr/bin/env perl
use Modern::Perl '2012';
use experimental 'switch', 'postderef';
use utf8;
# handle command line parameters
use Getopt::Long;
# retrieve network data
use HTTP::Tiny;
use JSON::Tiny 'decode_json';
# template
use Data::Section -setup;
use Template::Simple;
use Encode;
# transcode images into data urls
use MIME::Base64 'encode_base64';
use Mojo::DOM58;
# get command line options
# #################################################################################
# Main program: parse the options, fetch the post, collect the type-specific
# data, render it as HTML, inline the images and write the result to disk.

# get command line options
#   --url      address of the tumblr post to archive (required)
#   --basedir  directory below which the per-tumblr directory is created (default '.')
#   --photo    for photo posts, save only the image instead of an HTML page
my $url        = '';
my $basedir    = '.';
my $only_photo = 0;
GetOptions( "url=s" => \$url, "basedir=s" => \$basedir, "photo" => \$only_photo )
    or die("Could not parse options.\n");

# make sure we have a url
if ( $url eq '' ) { die("You must provide a url.\n"); }

# normalize the basedir so that a directory name can be appended to it directly
if ( $basedir !~ m[/$] ) { $basedir .= '/'; }

# get the tumblr data and the common elements
my $tumblr_data = get_tumblr_data($url);
my $post_data   = get_common_data( $tumblr_data->{'tumblelog'}, $tumblr_data->{'posts'}[ 0 ] );

# if we can, or want to, download only the photo, short circuit the rest of the program
# (download_only_photo ends the program itself once the image is saved)
if ( ( $post_data->{'_type'} eq 'photo' ) and ( $only_photo or should_download_only_photo($post_data) ) ) {
    download_only_photo($post_data);
}

# specialize on the tumblr type. A dispatch table is used instead of given/when,
# which relies on the experimental (and since deprecated) smartmatch feature.
my %add_data_for = (
    'regular' => \&add_regular_data,
    'answer'  => \&add_answer_data,
    'quote'   => \&add_quote_data,
    'photo'   => \&add_photo_data,
    'video'   => \&add_video_data,
);
my $add_data = $add_data_for{ $post_data->{'_type'} }
    or die( "Unknown post type: " . $post_data->{'_type'} . "\n" );
$add_data->($post_data);

# render the data
my $output = create_output_html($post_data);

# make all images data urls
convert_imgs($output);

# save the data
save_post($output);
# #################################################################################
# given a url, presume it is a tumblr-compliant url.
# get the data in json format and decode it
# Fetch a single tumblr post and return its decoded data.
#
# Takes the url of the post, which is presumed to be tumblr-compliant, requests
# it with '?format=json' appended and returns the decoded data structure: a hash
# reference that holds a 'tumblelog' entry and exactly one element in 'posts'.
#
# Dies when the request fails, when the response does not start with tumblr's
# Javascript wrapper, or when the decoded data does not hold exactly one post.
sub get_tumblr_data {
    my ($url) = @_;
    $url .= '?format=json';    # get data as json
    my $data = HTTP::Tiny->new->get($url);
    if ( !$data->{'success'} ) {
        die( "Error retrieving post: " . $data->{'status'} . ' ' . $data->{'reason'} );
    }
    if ( $data->{'content'} !~ m/^var tumblr_api_read = / ) {
        die("Error retrieving data: doesn't appear to be a tumblr.");
    }

    # tumblr returns Javascript, not really JSON: strip the variable assignment
    # in front of the data and the semicolon that ends the statement
    my $content = ( $data->{'content'} =~ s/^var tumblr_api_read = //r );
    $content =~ s/;$//;
    my $json = decode_json($content);

    # make sure it looks right, in case we got some other sort of json
    # or for some reason retrieved too many posts
    if ( !defined( $json->{'tumblelog'} ) ) {
        die("Malformed JSON data received.\n");
    }
    if ( !defined( $json->{'posts'} ) ) {
        die("Not enough posts received.\n");
    }

    # count the posts of the data that was just decoded; the result of this
    # sub has not been assigned to anything in the caller yet
    my $post_count = scalar( $json->{'posts'}->@* );
    if ( $post_count < 1 ) {
        die("Not enough posts received.\n");
    }
    if ( $post_count > 1 ) {
        die( "Too many posts received (" . $post_count . ").\n" );
    }
    return $json;
}
# take the tumblr json and extract the elements we want to use
# Take the tumblr json and extract the elements we want to use.
#
# Parameters:
#   $t - the 'tumblelog' part of the decoded data (hash reference)
#   $p - the post itself (hash reference)
#
# Returns a hash reference with the values shared by all post types. The raw
# input is kept under '_tumblr' and '_post', the post type under '_type'.
# 'post_tags' is an empty string when the post has no tags, otherwise a
# structure of the form { tags => [ { tag => ... }, ... ] } for the template.
# 'post_body' starts out empty and is filled in by the type-specific subs.
sub get_common_data {
    my ( $t, $p ) = @_;
    my $post = {
        '_tumblr'      => $t,
        '_post'        => $p,
        '_type'        => $p->{'type'},
        'tumblr_key'   => $t->{'name'} || 'unknown',
        'tumblr_title' => $t->{'title'} || 'Unknown',
        'post_id'      => $p->{'id'},
        'post_slug'    => $p->{'slug'} || $p->{'type'},
        'post_url'     => $p->{'url-with-slug'} || $p->{'url'},
        'post_date'    => $p->{'date-gmt'},
        'post_tags'    => '',
        'post_body'    => '',
    };

    # fix tags. A post without tags may not have a 'tags' entry at all, and
    # dereferencing the missing value would be fatal, so fall back to no tags.
    my $tags = $p->{'tags'} // [];
    if ( scalar( $tags->@* ) ) {
        # the leading '+' makes sure the braces are read as a hash constructor
        $post->{'post_tags'} = { 'tags' => [ map { +{ 'tag' => $_ } } $tags->@* ] };
    }
    return $post;
}
# given post data, create an "output" collection with the rendered HTML
# Render the post through the 'wrapper' template of the data section.
#
# Takes the post data and returns an "output" hash reference with the keys
# 'type' (always 'html'), 'dir', 'file', 'data' (the rendered page) and, when
# the post lists downloads, 'download'.
sub create_output_html {
    my ($post) = @_;

    my $renderer    = Template::Simple->new();
    my $wrapper_ref = main->section_data('wrapper');
    my $wrapper     = ${$wrapper_ref};
    my $html        = ${ $renderer->render( \$wrapper, $post ) };

    # Template::Simple tends to leave behind whitespace
    for ($html) {
        s/\n\t\n/\n/g;
        s/\n\n\n+/\n\n/g;
    }

    my %output = (
        'type' => 'html',
        'dir'  => $basedir . $post->{'tumblr_key'},
        'file' => $post->{'post_id'} . '--' . $post->{'post_slug'} . '.html',
        'data' => $html,
    );

    # copy over information on downloads, if they exist
    push $output{'download'}->@*, $post->{'_download'}->@* if $post->{'_download'};

    return \%output;
}
# go through the rendered HTML and find any img tags, and change
# them into data urls
# Go through the rendered HTML, find any img tags and change their sources
# into data urls, so that the saved page does not depend on remote images.
#
# Takes the "output" hash reference and rewrites its 'data' entry in place.
# A source that is already a data url is left untouched. A source that cannot
# be retrieved is replaced by 'error: <status> <reason>'.
sub convert_imgs {
    my ($output) = @_;
    my $ht = HTTP::Tiny->new();

    # every distinct url is requested only once per call
    my %replacement_for;

    my $_convert_imgs_helper = sub {
        my ($url) = @_;

        # nothing to fetch: the image is already embedded. Requesting it
        # would fail and replace the embedded image with an error text.
        return $url if lc( substr( $url, 0, 5 ) ) eq 'data:';

        return $replacement_for{$url} //= do {
            my $data = $ht->get($url);
            if ( $data->{'success'} ) {
                # fall back to a generic type when the server did not send one
                my $ct = $data->{'headers'}->{'content-type'} // 'application/octet-stream';
                'data:' . $ct . ';base64,' . encode_base64( $data->{'content'}, '' );
            }
            else {
                'error: ' . $data->{'status'} . ' ' . $data->{'reason'};
            }
        };
    };

    # $1 is everything of the tag up to the opening quote of src, $2 the url,
    # $3 the rest of the tag
    $output->{'data'} =~ s/( <img [^>]* src=") ([^"]+) (" [^>]* > )/$1 . $_convert_imgs_helper->($2) . $3/gex;
}
# write the data out to the file system. if there are any downloads (ie, videos)
# download them as well
# Write the data out to the file system. If there are any downloads (ie, videos)
# download them as well.
#
# Takes an "output" hash reference with the keys
#   'type'     - 'html' for rendered pages; anything else is written as binary
#   'dir'      - directory of the file; created when it does not exist
#   'file'     - name of the file inside 'dir'
#   'data'     - the content to write
#   'download' - optional list of { url => ..., file => ... } to store in 'dir'
#
# Dies when the directory or the file cannot be written. A failed download
# only produces a warning.
sub save_post {
    my ($output) = @_;
    my $dir  = $output->{'dir'};
    my $path = $dir . '/' . $output->{'file'};

    if ( !-d $dir ) {
        mkdir $dir or die("Could not create output directory '$dir': $!\n");
    }

    # rendered HTML is text and has to be encoded. Everything else (image
    # data) consists of bytes that must reach the disk unchanged; sending it
    # through the utf-8 layer would re-encode every byte above 0x7F.
    my $is_text = ( ( $output->{'type'} // 'html' ) eq 'html' );
    my $mode    = $is_text ? '>:encoding(utf-8)' : '>:raw';

    open( my $out, $mode, $path )
        or die("Could not create output file '$path': $!\n");
    print {$out} $output->{'data'};
    close($out) or die("Could not close output file '$path': $!\n");

    if ( $output->{'download'} ) {
        my $ht = HTTP::Tiny->new();
        foreach my $dl ( $output->{'download'}->@* ) {
            my $result = $ht->mirror( $dl->{'url'}, $dir . '/' . $dl->{'file'} );
            if ( !$result->{'success'} ) {
                warn("download failed for '$dl->{'url'}' to $dl->{'file'}: $result->{'status'} $result->{'reason'}\n");
            }
        }
    }
}
# #################################################################################
# check to see if this post is sufficiently unencumbered that we can download just the
# photo data
# Check to see if this post is sufficiently unencumbered that we can download
# just the photo data.
#
# Takes the post data and looks at the raw post stored under '_post'.
# Returns 1 when the post has no caption, an empty (or missing) 'photos' list
# and no tags; returns 0 otherwise.
sub should_download_only_photo {
    my ($post_data) = @_;
    my $p = $post_data->{'_post'};

    # a missing entry counts the same as an empty one; the fallbacks also keep
    # the dereferences below from dying on an undefined value
    my $caption = $p->{'photo-caption'} // '';
    my $photos  = $p->{'photos'}        // [];
    my $tags    = $p->{'tags'}          // [];

    if (1
        and ( $caption eq '' )                # there is no caption
        and ( !scalar( $photos->@* ) )        # there is only one photo (the list is used for several)
        and ( !scalar( $tags->@* ) )          # there are no tags
        )
    {
        return 1;
    }
    else {
        return 0;
    }
}
# download the photo directly to disk, skipping the whole HTML template stuff
# Store the photo of a post directly on disk, without rendering any HTML.
#
# Takes the post data, retrieves the image named by 'photo-url-1280' of the
# raw post, hands it to save_post and then ends the program with status 0.
# Dies when the post has no such url or the image cannot be retrieved.
sub download_only_photo {
    my ($p) = @_;

    # make sure there is an image to download
    my $photo_url = $p->{'_post'}->{'photo-url-1280'};
    die("Could not find an image URL to download.\n") if !defined($photo_url);

    # retrieve the image
    my $response = HTTP::Tiny->new()->get($photo_url);
    if ( !$response->{'success'} ) {
        die( "Error retrieving photo only: " . $response->{'status'} . ' ' . $response->{'reason'} . "\n" );
    }

    # the file name is whatever follows the last slash of the url that was
    # reported in the response
    my $filename = ( $response->{'url'} =~ s{^.+/([^/]+)$}{$1}r );

    # hand an "output" object over for saving
    save_post(
        {
            'type' => 'img',
            'dir'  => $basedir . $p->{'tumblr_key'},
            'file' => $p->{'post_id'} . '--' . $filename,
            'data' => $response->{'content'},
        }
    );
    exit(0);
}
# #################################################################################
# Fill in the body of a 'regular' (text) post.
#
# When the raw post has a non-empty 'regular-title', the body is set to that
# title as an h2 heading. The 'regular-body' is then appended to the body.
sub add_regular_data {
    my ($post) = @_;
    my $source = $post->{'_post'};
    my $title  = $source->{'regular-title'};

    $post->{'post_body'} = join( '', '<h2>', $title, '</h2>', "\n\n" )
        if defined($title) and $title ne '';

    $post->{'post_body'} .= $source->{'regular-body'};
}
# Fill in the body of an 'answer' post: the question and the answer of the
# raw post, each on its own line inside its own div.
sub add_answer_data {
    my ($post) = @_;
    my $source = $post->{'_post'};
    my @lines  = (
        '<div class="answer-q">', $source->{'question'}, '</div>',
        '<div class="answer-a">', $source->{'answer'},   '</div>',
    );
    $post->{'post_body'} = join( "\n", @lines );
}
# Fill in the body of a 'quote' post: the text and the source of the quote,
# each on its own line inside its own div.
sub add_quote_data {
    my ($post) = @_;
    my $source = $post->{'_post'};
    my @lines  = (
        '<div class="quote-text">',   $source->{'quote-text'},   '</div>',
        '<div class="quote-source">', $source->{'quote-source'}, '</div>',
    );
    $post->{'post_body'} = join( "\n", @lines );
}
# Collect the photos of a 'photo' post for the template.
#
# Sets 'post_photos' of the post data to a list of { caption => ..., url => ... }
# entries, where the url is the 'photo-url-1280' of the respective photo.
sub add_photo_data {
    my ($post) = @_;
    my $source = $post->{'_post'};

    # if there is only one photo, tumblr provides the photo data at the top level
    # however, if there are multiple photos, it uses the photos array.
    # A post without any 'photos' entry is treated like a single photo instead
    # of dying on the missing list.
    my $photos = $source->{'photos'} // [];

    if ( scalar( $photos->@* ) ) {
        $post->{'post_photos'} = [
            map { +{ 'caption' => $_->{'caption'}, 'url' => $_->{'photo-url-1280'} } } $photos->@*
        ];
    }
    else {
        $post->{'post_photos'} = [
            {
                'caption' => $source->{'photo-caption'},
                'url'     => $source->{'photo-url-1280'},
            }
        ];
    }
    return;
}
# Collect the video of a 'video' post for the template.
#
# Parses the 'video-player' markup of the raw post and stores one entry in
# 'post_videos' with the keys 'url', 'name', 'source', 'caption' and
# 'controller'. When a video url was found, the entry is also added to
# '_download'. Otherwise a warning is issued and the url stays empty.
sub add_video_data {
    my ($post) = @_;
    my $source = $post->{'_post'};
    my $videos = $post->{'post_videos'} = [];

    # although this could be a false assumption, I assume that there is only one video
    # and only one source
    my $dom          = Mojo::DOM58->new( $source->{'video-player'} );
    my $wrapper      = $dom->find('video[data-crt-options]')->[ 0 ];
    my $first_source = $dom->find('video source')->[ 0 ];

    # without a usable url the entry keeps an empty url and the bare post id as name
    my $video_url  = '';
    my $video_name = $post->{'post_id'};

    if ( !( $wrapper and $first_source ) ) {
        warn( "Video data does not seem to make sense:\n" . $source->{'video-player'} . "\n" );
    }
    else {
        my $options = decode_json( $wrapper->{'data-crt-options'} );
        if ( $options->{'hdUrl'} ) {
            # the url named in the player options takes precedence
            $video_url  = $options->{'hdUrl'};
            $video_name = $post->{'post_id'} . '.mp4';
        }
        elsif ( ( $first_source->{'type'} eq 'video/mp4' ) and ( $first_source->{'src'} =~ m/video_file/ ) ) {
            $video_url  = $first_source->{'src'};
            $video_name = $post->{'post_id'} . '.mp4';
        }
        else {
            warn( "Unsupported video type.\n" . $source->{'video-player'} . "\n" );
        }
    }

    my %video = (
        'url'     => $video_url,
        'name'    => $video_name,
        'source'  => $source->{'video-source'} . "\n" . $source->{'video-player'},
        'caption' => $source->{'video-caption'},
    );
    push $videos->@*, \%video;

    # only a video with a url can be downloaded
    if ( $video{'url'} ) {
        push $post->{'_download'}->@*, {
            'url'  => $video{'url'},
            'file' => $video{'name'},
        };
    }

    # the page plays the file under the name it is saved as
    $video{'controller'} = '<video src="' . $video{'name'} . '" width="" height="" controls preload allowfullscreen></video>';
    return;
}
__DATA__
__[ wrapper ]__
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<meta name="origin" content="[% post_url %]" />
<meta name="tumblr-type" content="[% _type %]" />
[% START post_tags %]<meta name="keywords" content="tumblr[% START tags %], [% tag %][% END tags %]" />[% END post_tags %]
<title>[% tumblr_title %] — [% post_id %] — [% post_slug %]</title>
<!-- #bbinclude "../tumblr.css" -->
<style type="text/css">
body {
margin: 0;
padding: 0;
font-family: "Hoefler Text", serif;
}
article {
margin: 0;
padding: 0;
padding: 0.5rem;
background: #EEEEEE;
border: 0.25rem solid #DDDDDD;
}
article header {
margin: 0;
margin-bottom: 1rem;
padding: 0;
border-bottom: thin solid #BBBBBB;
}
article header h1 {
margin: 0;
padding: 0;
margin-top: 1rem;
margin-bottom: 0.25rem;
font-size: 1.2rem;
}
article header p.date {
margin: 0;
padding: 0;
margin-top: 0.5rem;
margin-bottom: 0.5rem;
font-size: 0.8rem;
}
article header p.tags {
margin: 0;
padding: 0;
margin-top: 0.5rem;
margin-bottom: 0.5rem;
font-size: 0.8rem;
font-style: italic;
}
article header p.tags span.tag {
padding-right: 1rem;
}
article header p.tags span.tag:before {
content: '# ';
color: gray;
}
article figure img {
max-width: 100%;
}
article blockquote {
margin-left: 1rem;
border-left: thin dashed #CCCCCC;
padding-left: 0.5rem;
margin-right: 0;
}
video {
max-width: 100%;
}
</style>
<!-- end bbinclude -->
<style type="text/css">
</style>
</head>
<body>
<article>
<header>
<h1><a href="[% post_url %]">[% tumblr_title %] — [% post_id %] — [% post_slug %]</a></h1>
<p class="date">[% post_date %]</p>
[% START post_tags %]<p class="tags">[% START tags %]<span class="tag">[% tag %]</span>[% END tags %]</p>[% END post_tags %]
</header>
[% START post_photos %]
<figure>
<a href="[% url %]"><img src="[% url %]" /></a>
<figcaption>[% caption %]</figcaption>
</figure>
[% END post_photos %]
[% START post_videos %]
<!-- [% source %] -->
[% controller %]
[% caption %]
[% END post_videos %]
[% post_body %]
</article>
</body>
</html>