[% tumblr_title %] — [% post_id %] — [% post_slug %]
[% post_date %]
[% START post_tags %] [% END post_tags %]#!/usr/bin/env perl
use Modern::Perl '2012';
use experimental 'switch', 'postderef';
use utf8;
# handle command line parameters
use Getopt::Long;
# retrieve network data
use HTTP::Tiny;
use JSON::Tiny 'decode_json';
# template
use Data::Section -setup;
use Template::Simple;
use Encode;
# transcode images into data urls
use MIME::Base64 'encode_base64';
use Mojo::DOM58;
# #################################################################################
# main program: parse the command line, fetch a single tumblr post, then either
# save just the photo or render the post as HTML, inline its images and save it.
#
# NOTE: the variables declared here are file-scoped lexicals; subs further down
# read some of them directly (e.g. $basedir), so their names must not change.
# #################################################################################
# get command line options
my $url        = '';
my $basedir    = '.';
my $only_photo = 0;
GetOptions(
    "url=s"     => \$url,
    "basedir=s" => \$basedir,
    "photo"     => \$only_photo,
) or die("Could not parse options.\n");

# make sure we have a url
die("You must provide a url.\n") if $url eq '';

# normalize the basedir so that it always ends with a slash
$basedir .= '/' if $basedir !~ m[/$];

# get the tumblr data and the common elements
my $tumblr_data = get_tumblr_data($url);
my $post_data   = get_common_data( $tumblr_data->{'tumblelog'}, $tumblr_data->{'posts'}[ 0 ] );
my $post_type   = $post_data->{'_type'} // '';

# if we can, or want to, download only the photo, short circuit the rest of the
# program (download_only_photo exits when it is done)
if ( ( $post_type eq 'photo' ) and ( $only_photo or should_download_only_photo($post_data) ) ) {
    download_only_photo($post_data);
}

# specialize on the tumblr type. a dispatch table is used instead of given/when,
# which is an experimental feature that has been deprecated since perl 5.38
my %add_data_for = (
    'regular' => \&add_regular_data,
    'answer'  => \&add_answer_data,
    'quote'   => \&add_quote_data,
    'photo'   => \&add_photo_data,
    'video'   => \&add_video_data,
);
my $add_data = $add_data_for{$post_type}
    or die( "Unknown post type: " . $post_type . "\n" );
$add_data->($post_data);

# render the data
my $output = create_output_html($post_data);

# make all images data urls
convert_imgs($output);

# save the data
save_post($output);
# #################################################################################
# given a url, presume it is a tumblr-compliant url.
# get the data in json format and decode it
#
# params:  $url - url of a single tumblr post ('?format=json' is appended to it)
# returns: hashref of the decoded data; it is guaranteed to have a 'tumblelog'
#          entry and a 'posts' array with exactly one element
# dies:    if the request fails, if the response does not start with the
#          tumblr javascript wrapper, or if the decoded data looks wrong
sub get_tumblr_data {
    my ($url) = @_;
    $url .= '?format=json';    # get data as json
    my $response = HTTP::Tiny->new->get($url);
    if ( !$response->{'success'} ) {
        die( "Error retrieving post: " . $response->{'status'} . ' ' . $response->{'reason'} );
    }
    if ( $response->{'content'} !~ m/^var tumblr_api_read = / ) {
        die("Error retrieving data: doesn't appear to be a tumblr.");
    }

    # tumblr returns Javascript, not really JSON: strip the variable assignment
    # from the front and the statement terminator from the end
    my $content = ( $response->{'content'} =~ s/^var tumblr_api_read = //r );
    $content =~ s/;$//;
    my $json = decode_json($content);

    # make sure it looks right, in case we got some other sort of json
    # or for some reason retrieved the wrong number of posts
    if ( !defined( $json->{'tumblelog'} ) ) {
        die("Malformed JSON data received.\n");
    }
    if ( !defined( $json->{'posts'} ) ) {
        die("Not enough posts received.\n");
    }

    # count the posts in the data we just decoded. (the file-level $tumblr_data
    # is still unassigned while this sub is running, so it cannot be used here)
    my $post_count = scalar( $json->{'posts'}->@* );
    if ( $post_count < 1 ) {
        die("Not enough posts received.\n");
    }
    if ( $post_count > 1 ) {
        die( "Too many posts received (" . $post_count . ").\n" );
    }
    return $json;
}
# take the tumblr json and extract the elements we want to use
#
# params:  $t - hashref describing the tumblelog (the blog itself)
#          $p - hashref describing the post
# returns: hashref holding the raw data ('_tumblr', '_post', '_type') plus the
#          fields used for rendering. 'post_tags' is '' when the post has no
#          tags, otherwise { 'tags' => [ { 'tag' => ... }, ... ] }
sub get_common_data {
    my ( $t, $p ) = @_;
    my $post = {
        '_tumblr'      => $t,
        '_post'        => $p,
        '_type'        => $p->{'type'},
        'tumblr_key'   => $t->{'name'} || 'unknown',
        'tumblr_title' => $t->{'title'} || 'Unknown',
        'post_id'      => $p->{'id'},
        'post_slug'    => $p->{'slug'} || $p->{'type'},
        'post_url'     => $p->{'url-with-slug'} || $p->{'url'},
        'post_date'    => $p->{'date-gmt'},
        'post_tags'    => '',
        'post_body'    => '',
    };

    # fix tags. a post without a 'tags' entry is treated as having no tags,
    # rather than dying on the dereference of an undefined value
    my @tags = @{ $p->{'tags'} // [] };
    if (@tags) {
        $post->{'post_tags'} = { 'tags' => [ map { +{ 'tag' => $_ } } @tags ] };
    }
    return $post;
}
# render the post through the 'wrapper' template section and package the
# result as an "output" collection
#
# params:  $post - post data hashref, used as the template data
# returns: hashref with 'type', 'dir', 'file' and 'data' (the rendered HTML),
#          plus 'download' when the post carries a '_download' list
sub create_output_html {
    my ($post) = @_;

    my $renderer    = Template::Simple->new();
    my $wrapper_ref = main->section_data('wrapper');
    my $wrapper     = ${$wrapper_ref};
    my $html_ref    = $renderer->render( \$wrapper, $post );
    my $html        = ${$html_ref};

    # Template::Simple tends to leave behind whitespace
    for ($html) {
        s/\n\t\n/\n/g;
        s/\n\n\n+/\n\n/g;
    }

    my %output = (
        'type' => 'html',
        'dir'  => $basedir . $post->{'tumblr_key'},
        'file' => $post->{'post_id'} . '--' . $post->{'post_slug'} . '.html',
        'data' => $html,
    );

    # copy over information on downloads, if they exist
    if ( $post->{'_download'} ) {
        $output{'download'} = [ $post->{'_download'}->@* ];
    }

    return \%output;
}
# go through the rendered HTML and find any img tags, and change
# them into data urls
#
# params:  $output - "output" hashref; its 'data' entry is modified in place
# notes:   only the src attribute of img tags is rewritten; src attributes on
#          other tags are left alone. an image that cannot be retrieved has
#          its src replaced by an 'error: <status> <reason>' string.
sub convert_imgs {
    my ($output) = @_;
    my $ht = HTTP::Tiny->new();

    # fetch a single image and return it encoded as a data url
    my $_convert_imgs_helper = sub {
        my ($url) = @_;

        # already inlined: there is nothing to fetch
        return $url if $url =~ m/^data:/i;

        my $data = $ht->get($url);
        if ( !$data->{'success'} ) {
            return 'error: ' . $data->{'status'} . ' ' . $data->{'reason'};
        }

        # fall back to a generic type if the server did not send one
        my $ct = $data->{'headers'}->{'content-type'} // 'application/octet-stream';
        return 'data:' . $ct . ';base64,' . encode_base64( $data->{'content'}, '' );
    };

    # $1 is everything from the start of the img tag up to the opening quote of
    # src, $2 is the url, and $3 is the remainder of the tag
    $output->{'data'} =~ s{
        ( <img \b [^>]* \s src=" )
        ( [^"]+ )
        ( " [^>]* > )
    }{ $1 . $_convert_imgs_helper->($2) . $3 }gex;
}
# write the data out to the file system. if there are any downloads (ie, videos)
# download them as well
#
# params:  $output - "output" hashref with 'dir', 'file' and 'data'; optional
#                    'type' ('img' means 'data' is raw bytes) and optional
#                    'download', a list of { 'url' => ..., 'file' => ... }
# dies:    if the directory or the file cannot be created, written or closed.
#          a failed download only produces a warning.
sub save_post {
    my ($output) = @_;
    my $dir  = $output->{'dir'};
    my $path = $dir . '/' . $output->{'file'};

    if ( !-d $dir ) {
        mkdir $dir or die("Could not create output directory '$dir': $!\n");
    }

    # images are raw bytes and have to be written untouched; pushing them through
    # the utf-8 layer would re-encode every byte above 0x7f and corrupt the file.
    # everything else is text and is written as utf-8.
    my $mode = ( ( $output->{'type'} // '' ) eq 'img' ) ? '>:raw' : '>:encoding(utf-8)';

    open( my $out, $mode, $path )
        or die("Could not create output file '$path': $!\n");
    print {$out} $output->{'data'}
        or die("Could not write to output file '$path': $!\n");
    close($out) or die("Could not close output file '$path': $!\n");

    if ( $output->{'download'} ) {
        my $ht = HTTP::Tiny->new();
        foreach my $dl ( $output->{'download'}->@* ) {
            my $result = $ht->mirror( $dl->{'url'}, $dir . '/' . $dl->{'file'} );
            if ( !$result->{'success'} ) {
                warn("download failed for '$dl->{'url'}' to $dl->{'file'}: $result->{'status'} $result->{'reason'}\n");
            }
        }
    }
}
# #################################################################################
# check to see if this post is sufficiently unencumbered that we can download just the
# photo data
#
# params:  $post_data - post data hashref; the raw post is read from '_post'
# returns: 1 if the post has no caption, an empty (or missing) 'photos' list
#          and no tags; 0 otherwise
sub should_download_only_photo {
    my ($post_data) = @_;
    my $p = $post_data->{'_post'};

    # missing entries are treated the same as empty ones
    my $caption = $p->{'photo-caption'} // '';
    my @photos  = @{ $p->{'photos'} // [] };
    my @tags    = @{ $p->{'tags'} // [] };

    return 0 if $caption ne '';    # there is a caption

    # NOTE(review): an empty 'photos' list is taken to mean "there is only one
    # photo" -- presumably the list is only filled for photo sets; confirm
    return 0 if @photos;

    return 0 if @tags;             # there are tags
    return 1;
}
# download the photo directly to disk, skipping the whole HTML template stuff
#
# params:  $p - post data hashref; the image url is read from the raw post's
#               'photo-url-1280' entry
# dies:    if the post has no such url, or if the image cannot be retrieved
# notes:   never returns: the program exits with status 0 after saving
sub download_only_photo {
    my ($p) = @_;
    my $photo_url = $p->{'_post'}->{'photo-url-1280'};

    # make sure there is an image to download
    defined($photo_url)
        or die("Could not find an image URL to download.\n");

    # retrieve the image
    my $response = HTTP::Tiny->new()->get($photo_url);
    if ( !$response->{'success'} ) {
        die( "Error retrieving photo only: " . $response->{'status'} . ' ' . $response->{'reason'} . "\n" );
    }

    # get the file name from the url the response reports: everything after
    # the last slash
    my $filename = ( $response->{'url'} =~ s{^.+/([^/]+)$}{$1}r );

    # create an "output" object and hand it over to be saved
    save_post(
        {
            'type' => 'img',
            'dir'  => $basedir . $p->{'tumblr_key'},
            'file' => $p->{'post_id'} . '--' . $filename,
            'data' => $response->{'content'},
        }
    );
    exit(0);
}
# #################################################################################
sub add_regular_data {
my ($post) = @_;
my $source = $post->{'_post'};
if ( defined( $source->{'regular-title'} ) and ( $source->{'regular-title'} ne '' ) ) {
$post->{'post_body'} = '
[% post_date %]
[% START post_tags %] [% END post_tags %]