Digital pack-ratting for Tumblr

Introduction

Ephemeral

I like to re-read books. And re-watch movies. It should be no surprise, then, that I also like to re-view things on the internet. Jokes, memes, inspiring pictures — these are all things that I like to go back to from time to time. However, things on the internet are temporary, because I don’t control the server they are on.

A common source of things that I want to save, especially images, is Tumblr. See, for example, Ruined Childhood, Star Trek Ships, Colour my world, and 99 % Invisible.

This talk describes a program I wrote to help me save some of these Tumblr posts for when I’m bored, depressed, or 90.

My goal

But how?

Perl 5 Code

A rewrite

Prolog

#!/usr/bin/env perl
use Modern::Perl '2012';
use experimental 'switch', 'postderef';
use utf8;

Modules

# handle command line parameters
use Getopt::Long;

# retrieve network data
use HTTP::Tiny;
use JSON::Tiny 'decode_json';

# template
use Data::Section -setup;
use Template::Simple;
use Encode;

# transcode images into data urls
use MIME::Base64 'encode_base64';
use Mojo::DOM58;

Command-line

# defaults for everything that can be set on the command line
my ( $url, $basedir, $only_photo ) = ( '', '.', 0 );

# parse the command line; options that cannot be parsed are fatal
GetOptions(
    "url=s"     => \$url,
    "basedir=s" => \$basedir,
    "photo"     => \$only_photo,
) or die("Could not parse options.\n");

# a url is mandatory
die("You must provide a url.\n") if $url eq '';

# directory names are appended directly to the basedir later on,
# so make sure it ends in a slash
$basedir .= '/' unless $basedir =~ m[/$];

RSS

-- Hand the link of the article currently selected in Vienna to the download script.
-- Any failure (no current article, non-zero exit of the script) is shown in a dialog.
tell application "Vienna"
    try
        set article_url to link of current article
        -- "quoted form of" makes the url a single shell word, so characters such as
        -- & ? ; or spaces in the link are passed to the script instead of being
        -- interpreted by the shell
        do shell script "/Users/pukku/Pictures/tumblr/download_tumblr.pl -url=" & quoted form of article_url
    on error errMsg
        display dialog "An error occurred: " & errMsg
    end try
end tell

Main flow

# get the tumblr data and the common elements
# (get_tumblr_data only returns when exactly one post was received, hence posts[0])
my $tumblr_data = get_tumblr_data($url);
my $post_data = get_common_data( $tumblr_data->{'tumblelog'}, $tumblr_data->{'posts'}[ 0 ] );

# if we can, or want to, download only the photo, short circuit the rest of the program
# (download_only_photo saves the image and then calls exit, so control never comes back here)
if ( ( $post_data->{'_type'} eq 'photo' ) and ( $only_photo or should_download_only_photo($post_data) ) ) {
    download_only_photo($post_data);
}

# specialize on the tumblr type
# (the "..." is a placeholder in this overview; the complete block is shown separately)
given ( $post_data->{'_type'} ) {
    ...;
}

# render the data
my $output = create_output_html($post_data);

# make all images data urls (modifies $output->{'data'} in place)
convert_imgs($output);

# save the data
save_post($output);

get_tumblr_data

# given a url, presume it is a tumblr-compliant url.
# get the data in json format and decode it
#
# Parameters:
#   $url - url of a single tumblr post; '?format=json' is appended to it
# Returns:
#   the decoded data as a hashref with a 'tumblelog' entry and exactly one
#   element in its 'posts' array
# Dies:
#   when the HTTP request fails, when the response is not the tumblr
#   Javascript wrapper, or when the decoded data is not exactly one post
sub get_tumblr_data {
    my ($url) = @_;
    $url .= '?format=json';    # get data as json

    my $data = HTTP::Tiny->new->get($url);
    if ( !$data->{'success'} ) {
        die( "Error retrieving post: " . $data->{'status'} . ' ' . $data->{'reason'} );
    }
    if ( $data->{'content'} !~ m/^var tumblr_api_read = / ) {
        die("Error retrieving data: doesn't appear to be a tumblr.");
    }

    # tumblr returns Javascript, not really JSON
    my $content = ( $data->{'content'} =~ s/^var tumblr_api_read = //r );
    $content =~ s/;$//;
    my $json = decode_json($content);

    # make sure it looks right, in case we got some other sort of json
    # or for some reason retrieved too many posts
    if ( !defined( $json->{'tumblelog'} ) ) {
        die("Malformed JSON data received.\n");
    }

    my $posts = $json->{'posts'};
    if ( defined($posts) and ref($posts) ne 'ARRAY' ) {
        die("Malformed JSON data received.\n");
    }
    if ( !defined($posts) or !scalar( $posts->@* ) ) {
        die("Not enough posts received.\n");
    }
    if ( scalar( $posts->@* ) != 1 ) {

        # count the posts in the data decoded here; the file-level
        # $tumblr_data is still unassigned while this sub is running
        die( "Too many posts received (" . scalar( $posts->@* ) . ").\n" );
    }

    return $json;
}
# the same "copy, then strip the prefix" written without the /r modifier:
# the assignment in parentheses happens first, then s/// edits the new $content
( my $content = $data->{'content'} ) =~ s/^var tumblr_api_read = //;

get_common_data

# Collect the fields shared by every post type into one hashref.
#   $t - the 'tumblelog' part of the tumblr json
#   $p - one entry of its 'posts' array
# The raw inputs remain available under '_tumblr' and '_post'.
sub get_common_data {
    my ( $t, $p ) = @_;

    my %post = (
        '_tumblr' => $t,
        '_post'   => $p,
        '_type'   => $p->{'type'},

        'tumblr_key'   => $t->{'name'}  || 'unknown',
        'tumblr_title' => $t->{'title'} || 'Unknown',

        'post_id'   => $p->{'id'},
        'post_slug' => $p->{'slug'} || $p->{'type'},
        'post_url'  => $p->{'url-with-slug'} || $p->{'url'},
        'post_date' => $p->{'date-gmt'},
        'post_tags' => '',

        'post_body' => '',
    );

    # tags become a list of { tag => ... } records; when the post has
    # no tags the empty string set above stays in place
    my $tags = $p->{'tags'};
    if ( defined($tags) and scalar( $tags->@* ) ) {
        my @records;
        push @records, { 'tag' => $_ } for $tags->@*;
        $post{'post_tags'} = { 'tags' => \@records };
    }

    return \%post;
}

specialize on the type

# specialize on the tumblr type
# (a dispatch table is used instead of given/when, which is an experimental feature)
{
    # post type => sub that adds the type-specific fields to the post data
    my %add_data_for = (
        'regular' => \&add_regular_data,
        'answer'  => \&add_answer_data,
        'quote'   => \&add_quote_data,
        'photo'   => \&add_photo_data,
        'video'   => \&add_video_data,
    );

    my $add_data = $add_data_for{ $post_data->{'_type'} }
      or die( "Unknown post type: " . $post_data->{'_type'} . "\n" );
    $add_data->($post_data);
}

the straight-forward ones

# Build the body of a "regular" post: an optional <h2> title followed by
# the 'regular-body' of the raw post, appended to the existing post_body.
sub add_regular_data {
    my ($post) = @_;
    my $raw = $post->{'_post'};

    my $title = $raw->{'regular-title'};
    if ( defined($title) and $title ne '' ) {
        $post->{'post_body'} = "<h2>$title</h2>\n\n";
    }

    $post->{'post_body'} .= $raw->{'regular-body'};
}

# Build the body of an "answer" post: the question and the answer, each in
# its own div, one piece per line.
sub add_answer_data {
    my ($post) = @_;
    my $raw = $post->{'_post'};

    $post->{'post_body'} = join(
        "\n",
        '<div class="answer-q">', $raw->{'question'}, '</div>',
        '<div class="answer-a">', $raw->{'answer'},   '</div>',
    );
}

# Build the body of a "quote" post: the quoted text and its source, each in
# its own div, one piece per line.
sub add_quote_data {
    my ($post) = @_;
    my $raw = $post->{'_post'};

    $post->{'post_body'} = join(
        "\n",
        '<div class="quote-text">',   $raw->{'quote-text'},   '</div>',
        '<div class="quote-source">', $raw->{'quote-source'}, '</div>',
    );
}

add_photo_data

# Fill $post->{'post_photos'} with one { caption, url } record per photo.
#
# if there is only one photo, tumblr provides the photo data at the top level
# however, if there are multiple photos, it uses the photos array.
sub add_photo_data {
    my ($post) = @_;
    my $source = $post->{'_post'};

    # a post without a 'photos' key is handled like one with an empty
    # array; dereferencing the missing value would die under strict refs
    my @photos = @{ $source->{'photos'} // [] };

    $post->{'post_photos'} =
      @photos
      ? [ map { +{ 'caption' => $_->{'caption'}, 'url' => $_->{'photo-url-1280'} } } @photos ]
      : [ { 'caption' => $source->{'photo-caption'}, 'url' => $source->{'photo-url-1280'} } ];

    return;
}

add_video_data

# Describe the video of a "video" post in $post->{'post_videos'} and, when a
# downloadable file was found, queue it in $post->{'_download'}.
sub add_video_data {
    my ($post) = @_;
    my $source = $post->{'_post'};

    $post->{'post_videos'} = [];

    # although this could be false, I assume that there is only one video
    # and only one source
    my $dom           = Mojo::DOM58->new( $source->{'video-player'} );
    my $video_wrapper = $dom->find('video[data-crt-options]')->[ 0 ];
    my $video_source  = $dom->find('video source')->[ 0 ];

    # until a usable file turns up there is nothing to download, and the
    # name is the bare post id
    my $url  = '';
    my $name = $post->{'post_id'};

    if ( !( $video_wrapper and $video_source ) ) {
        warn( "Video data does not seem to make sense:\n" . $source->{'video-player'} . "\n" );
    }
    else {
        my $video_options = decode_json( $video_wrapper->{'data-crt-options'} );
        if ( $video_options->{'hdUrl'} ) {

            # prefer the HD file named in the player options
            $url  = $video_options->{'hdUrl'};
            $name = $post->{'post_id'} . '.mp4';
        }
        elsif ( ( $video_source->{'type'} eq 'video/mp4' ) and ( $video_source->{'src'} =~ m/video_file/ ) ) {

            # otherwise fall back to the mp4 in the <source> element
            $url  = $video_source->{'src'};
            $name = $post->{'post_id'} . '.mp4';
        }
        else {
            warn( "Unsupported video type.\n" . $source->{'video-player'} . "\n" );
        }
    }

    # every outcome produces the same record; only url and name differ
    my $video = {
        'url'     => $url,
        'name'    => $name,
        'source'  => $source->{'video-source'} . "\n" . $source->{'video-player'},
        'caption' => $source->{'video-caption'},
    };
    push $post->{'post_videos'}->@*, $video;

    if ( $video->{'url'} ) {
        push $post->{'_download'}->@*, {
            'url'  => $video->{'url'},
            'file' => $video->{'name'},
        };
    }
    $video->{'controller'} = '<video src="' . $video->{'name'} . '" width="" height="" controls preload allowfullscreen></video>';

    return;
}

create_output_html

# Render the post through the 'wrapper' template and describe where the
# result is to be written.
#   $post - the post data hashref
# Returns an "output" hashref with 'type', 'dir', 'file' and 'data', plus
# 'download' when the post carries a '_download' list.
sub create_output_html {
    my ($post) = @_;

    my $wrapper  = ${ main->section_data('wrapper') };
    my $rendered = ${ Template::Simple->new()->render( \$wrapper, $post ) };

    # Template::Simple tends to leave behind whitespace
    for ($rendered) {
        s/\n\t\n/\n/g;
        s/\n\n\n+/\n\n/g;
    }

    my %output = (
        'type' => 'html',
        'dir'  => $basedir . $post->{'tumblr_key'},
        'file' => $post->{'post_id'} . '--' . $post->{'post_slug'} . '.html',
        'data' => $rendered,
    );

    # copy over information on downloads, if they exist
    $output{'download'} = [ $post->{'_download'}->@* ] if $post->{'_download'};

    return \%output;
}

the template

__DATA__

__[wrapper]__
<!DOCTYPE html>
<html lang="en">
<head>
	<meta charset="utf-8" />
	<meta name="viewport" content="width=device-width, initial-scale=1.0" />
	<meta name="origin" content="[% post_url %]" />
	<meta name="tumblr-type" content="[% _type %]" />
	[% START post_tags %]<meta name="keywords" content="tumblr[% START tags %], [% tag %][% END tags %]" />[% END post_tags %]
	<title>[% tumblr_title %][% post_id %][% post_slug %]</title>
<!-- #bbinclude "../tumblr.css" -->
	<style type="text/css">
		body {
			margin: 0;
			padding: 0;
			font-family: "Hoefler Text", serif;
		}
		article {
			margin: 0;
			padding: 0;
			padding: 0.5rem;
			background: #EEEEEE;
			border: 0.25rem solid #DDDDDD;
		}
		article header {
			margin: 0;
			margin-bottom: 1rem;
			padding: 0;
			border-bottom: thin solid #BBBBBB;
		}
		article header h1 {
			margin: 0;
			padding: 0;
			margin-top: 1rem;
			margin-bottom: 0.25rem;
			font-size: 1.2rem;
		}
		article header p.date {
			margin: 0;
			padding: 0;
			margin-top: 0.5rem;
			margin-bottom: 0.5rem;
			font-size: 0.8rem;
		}
		article header p.tags {
			margin: 0;
			padding: 0;
			margin-top: 0.5rem;
			margin-bottom: 0.5rem;
			font-size: 0.8rem;
			font-style: italic;
		}
		article header p.tags span.tag {
			padding-right: 1rem;
		}
		article header p.tags span.tag:before {
			content: '# ';
			color: gray;
		}
		article figure img {
			max-width: 100%;
		}
		article blockquote {
			margin-left: 1rem;
			border-left: thin dashed #CCCCCC;
			padding-left: 0.5rem;
			margin-right: 0;
		}
		video {
			max-width: 100%;
		}
	</style>
<!-- end bbinclude -->
	<style type="text/css">
	</style>
</head>
<body>
<article>
<header>
	<h1><a href="[% post_url %]">[% tumblr_title %][% post_id %][% post_slug %]</a></h1>
	<p class="date">[% post_date %]</p>
	[% START post_tags %]<p class="tags">[% START tags %]<span class="tag">[% tag %]</span>[% END tags %]</p>[% END post_tags %]
</header>
[% START post_photos %]
<figure>
	<a href="[% url %]"><img src="[% url %]" /></a>
	<figcaption>[% caption %]</figcaption>
</figure>
[% END post_photos %]
[% START post_videos %]
<!-- [% source %] -->
[% controller %]
[% caption %]
[% END post_videos %]
[% post_body %]
</article>
</body>
</html>

convert_imgs (version 1)

# Replace the src of every img tag in $output->{'data'} with a data url
# holding the fetched image, or with an 'error: ...' text when the fetch fails.
sub convert_imgs {
    my ($output) = @_;

    my $ua  = HTTP::Tiny->new();
    my $dom = Mojo::DOM58->new( $output->{'data'} );

    $dom->find('img')->each(
        sub {
            my ($img) = @_;
            my $response = $ua->get( $img->{'src'} );
            $img->{'src'} =
              $response->{'success'}
              ? 'data:' . $response->{'headers'}->{'content-type'} . ';base64,' . encode_base64( $response->{'content'}, '' )
              : 'error: ' . $response->{'status'} . ' ' . $response->{'reason'};
        }
    );

    $output->{'data'} = $dom->to_string();
}

convert_imgs (version 2)

# Rewrite the src attribute of every img tag in $output->{'data'} in place,
# working on the HTML text directly with a substitution.
sub convert_imgs {
    my ($output) = @_;

    my $ua = HTTP::Tiny->new();

    # fetch one image and return the new value for its src attribute:
    # a base64 data url on success, an 'error: ...' text otherwise
    my $replacement_for = sub {
        my ($src) = @_;
        my $response = $ua->get($src);

        return 'error: ' . $response->{'status'} . ' ' . $response->{'reason'}
          unless $response->{'success'};

        my $type = $response->{'headers'}->{'content-type'};
        return 'data:' . $type . ';base64,' . encode_base64( $response->{'content'}, '' );
    };

    $output->{'data'} =~ s/( <img [^>]* src=") ([^"]+) (" [^>]* > )/$1 . $replacement_for->($2) . $3/gex;
}

save_post

# Write an "output" hashref to disk as <dir>/<file>, creating <dir> when it is
# missing, and then mirror every entry of its 'download' list (ie, videos)
# into the same directory. A failed download only warns.
sub save_post {
    my ($output) = @_;

    my $dir  = $output->{'dir'};
    my $path = $dir . '/' . $output->{'file'};

    if ( !-d $dir ) {
        mkdir $dir or die("Could not create output directory '$dir': $!\n");
    }

    # html is encoded as utf-8 on the way out; anything else gets a plain '>' handle
    my $mode = ( $output->{'type'} eq 'html' ) ? '>:encoding(utf-8)' : '>';
    open( my $out, $mode, $path ) or die("Could not create output file '$path': $!\n");
    print $out $output->{'data'};
    close($out) or die("Could not close output file '$path': $!\n");

    if ( $output->{'download'} ) {
        my $ht = HTTP::Tiny->new();
        foreach my $dl ( $output->{'download'}->@* ) {
            my $result = $ht->mirror( $dl->{'url'}, $dir . '/' . $dl->{'file'} );
            next if $result->{'success'};
            warn("download failed for '$dl->{'url'}' to $dl->{'file'}: $result->{'status'} $result->{'reason'}\n");
        }
    }
}

Downloading only the photos…

# if we can, or want to, download only the photo, short circuit the rest of the program
# ($only_photo is the -photo command line switch; download_only_photo ends the program itself)
if ( ( $post_data->{'_type'} eq 'photo' ) and ( $only_photo or should_download_only_photo($post_data) ) ) {
    download_only_photo($post_data);
}

should_download_only_photo

# check to see if this post is sufficiently unencumbered
# that we can download just the photo data
#
# Parameters:
#   $post_data - the common post data; only its '_post' entry is inspected
# Returns:
#   1 when the post has no caption, no 'photos' entries and no tags, else 0
sub should_download_only_photo {
    my ($post_data) = @_;
    my $p = $post_data->{'_post'};

    my $caption = $p->{'photo-caption'};

    # a missing 'photos' or 'tags' key counts as an empty list; dereferencing
    # the undefined value directly would die under strict refs
    my $photo_count = scalar @{ $p->{'photos'} // [] };
    my $tag_count   = scalar @{ $p->{'tags'}   // [] };

    return 0 if defined($caption) and $caption ne '';    # there is a caption
    return 0 if $photo_count;                             # there is more than one photo
    return 0 if $tag_count;                               # there are tags
    return 1;
}

download_only_photo

# Save just the image of a photo post, bypassing the HTML template, and end
# the program with exit code 0.
#   $p - the common post data; the image url is read from its '_post' entry
sub download_only_photo {
    my ($p) = @_;

    # make sure there is an image to download
    my $photo_url = $p->{'_post'}->{'photo-url-1280'};
    defined($photo_url) or die("Could not find an image URL to download.\n");

    # retrieve the image
    my $response = HTTP::Tiny->new()->get($photo_url);
    if ( !$response->{'success'} ) {
        die( "Error retrieving photo only: " . $response->{'status'} . ' ' . $response->{'reason'} . "\n" );
    }

    # the file name is whatever follows the last slash of the url in the response
    ( my $filename = $response->{'url'} ) =~ s{^.+/([^/]+)$}{$1};

    # hand an "output" object to the common save routine
    save_post(
        {
            'type' => 'img',
            'dir'  => $basedir . $p->{'tumblr_key'},
            'file' => $p->{'post_id'} . '--' . $filename,
            'data' => $response->{'content'},
        }
    );
    exit(0);
}

Perl 6 code

A challenge

Great community!

Prolog

#!/usr/bin/env perl6
use v6;
# write the message (the single placeholder parameter $^msg) with note, then end the program with exit code 1
sub croak { note $^msg; exit(1); } # because Perl 6 doesn't have the Perl 5 "\n" magic for die

Modules

# retrieve network data
use HTTP::UserAgent;
use JSON::Tiny;

# template
use Template::Mustache;

# transcode images into data urls
use MIME::Base64;

Command-line & Main flow

# program entry point; the named parameters are the command line options:
#   :$url            - required (marked with !), the tumblr post to save
#   :$base-directory - directory to work in, defaults to './'
#   :$photo          - accepted, but not used anywhere in the body (see the @NOTE below)
sub MAIN (Str :$url!, Str :$base-directory = './', Bool :$photo = False) {

    # change directory to the appropriate base directory
    (try chdir $base-directory) orelse croak("Couldn't change to the base directory '$base-directory': $!");

    # retrieve the data from the web and extract the common elements
    my %tdata = get-tumblr-data($url);
    my %pdata = get-common-data(%tdata);

    # @NOTE not implemented: handling download only photo

    # add the special elements for the particular post type
    # (the first argument selects which add-tumblr-data multi is called)
    add-tumblr-data(%pdata<_type>, %pdata);

    # create an "output" object(ish)
    my %output = create-output-html(%pdata);

    # turn all imgs into data URIs
    convert-imgs(%output);

    # save the output
    save-post(%output);
}

(another error handling syntax)

# the same chdir with a CATCH block instead of "orelse":
# only exceptions matching X::IO are turned into a croak
try {
    chdir $base-directory;
    CATCH {
        when X::IO { croak("Couldn't change to the base directory '$base-directory': $_") }
    }
}

get-tumblr-data

# retrieve the JSON(ish) data for the post
#   $url - url of the post; '?format=json' is appended before it is fetched
# returns the decoded data as a hash; every failed check ends the program through croak
sub get-tumblr-data (Str $url) {
    my $data = HTTP::UserAgent.new.get($url ~ '?format=json');
    $data.is-success                               or croak("HTTP error retrieving post: {$data.status-line}.");
    $data.content ~~ m:s/^var tumblr_api_read \= / or croak("Error retrieving data: doesn't appear to be a tumblr.");

    # Tumblr actually returns a JavaScript snippet, which we want to turn into JSON
    # (strip the leading "var tumblr_api_read = " and the trailing ";")
    my $content = $data.content.chomp;
    $content ~~ s:s/^var tumblr_api_read \= //;
    $content ~~ s/\;$//;

    my %json = from-json($content);

    # sanity checks: both top-level keys must be present, and there must be exactly one post
    %json<tumblelog>          or croak("Malformed JSON data received.");
    %json<posts>              or croak("Not enough posts received.");
    %json<posts>:v.elems == 1 or croak("Too many posts received ({%json<posts>:v.elems}).");

    return %json;
}

get-common-data

# extract the data pieces that we will want for all post types
#   %tdata - the decoded tumblr data; its <tumblelog> and the first element of <posts> are used
# returns a hash with the raw parts under _tumblr/_post/_type plus the tumblr_* and post_* fields
sub get-common-data (%tdata) {
    my %t = %tdata<tumblelog>;
    my %p = %tdata<posts>[0];

    my %post = (
        _tumblr => %t,
        _post   => %p,
        _type   => %p<type>,

        # fall back to a fixed text when the name or title is missing or empty
        tumblr_key   => %t<name>  || 'unknown',
        tumblr_title => %t<title> || 'unknown',

        post_id   => %p<id>,
        post_slug => %p<slug>          || %p<type>,
        post_url  => %p<url-with-slug> || %p<url>,
        post_date => %p<date-gmt>,
        post_tags => {},

        post_body => '',
    );

    # fix tags
    # (each tag becomes a "tag => value" entry; without tags post_tags stays an empty hash)
    %p<tags> and %post<post_tags><tags> = [ %p<tags>.list.map: { tag => $_ } ];

    return %post;
}

specializing on the type

# generic handler for unknown post types
# (this candidate puts no constraint on $type; the other candidates each name one type in a where clause)
multi add-tumblr-data ($type, %pdata) {
    croak("Unimplemented post type: {%pdata<_type>}.");
}

the straight-forward ones

# 'regular' posts: an optional <h2> title followed by the body of the post
multi add-tumblr-data ($ where 'regular', %pdata) {
    my %s = %pdata<_post>;

    # the block only runs, with $title bound to the value, when regular-title is true
    if %s<regular-title> -> $title {
        %pdata<post_body> = "<h2>$title</h2>\n\n";
    }

    %pdata<post_body> ~= %s<regular-body>;
}

Photos

# 'photo' posts: fill %pdata<post_photos> with one caption/url entry per photo
multi add-tumblr-data ($ where 'photo', %pdata) {
    my %s = %pdata<_post>;

    %pdata<post_photos> = [];

    # when <photos> is false, the caption and url come from the top level of the post
    if !%s<photos> {
        %pdata<post_photos>.push: {
            caption => %s<photo-caption>,
            url     => %s<photo-url-1280>,
        };
    }
    # otherwise every entry of <photos> supplies its own caption and url
    else {
        for |%s<photos> -> %ph {
            %pdata<post_photos>.push: {
                caption => %ph<caption>,
                url     => %ph<photo-url-1280>,
            };
        }
    }
}

Video

# 'video' posts are deliberately unsupported here: the program ends with a message
multi add-tumblr-data ($ where 'video', %pdata) {
    croak("I'm not implementing video for this talk.");
}

create-output-html

# create a spec for saving the data to disk, rendering off the template
#   %pdata - the post data handed to the template
# returns a hash with type, dir (ending in '/'), file and the rendered data
sub create-output-html (%pdata) {
    my %output = (
        type => 'html',
        dir  => %pdata<tumblr_key> ~ '/',
        file => %pdata<post_id> ~ '--' ~ %pdata<post_slug> ~ '.pl6.html',
        # the template text is taken from $=finish
        data => Template::Mustache.render($=finish, %pdata),
    );

    # @NOTE not implemented download handling

    return %output;
}

The mustache template

=finish
<!DOCTYPE html>
<html lang="en">
<head>
	<meta charset="utf-8" />
	<meta name="viewport" content="width=device-width, initial-scale=1.0" />
	<meta name="origin" content="{{ post_url }}" />
	<meta name="tumblr-type" content="{{ _type }}" />
	{{# post_tags }}<meta name="keywords" content="tumblr{{# tags }}, {{ tag }}{{/ tags }}" />{{/ post_tags }}
	<title>{{ tumblr_title }}{{ post_id }}{{ post_slug }}</title>
<!-- #bbinclude "../tumblr.css" -->
	<style type="text/css">
		body {
			margin: 0;
			padding: 0;
			font-family: "Hoefler Text", serif;
		}
		article {
			margin: 0;
			padding: 0;
			padding: 0.5rem;
			background: #EEEEEE;
			border: 0.25rem solid #DDDDDD;
		}
		article header {
			margin: 0;
			margin-bottom: 1rem;
			padding: 0;
			border-bottom: thin solid #BBBBBB;
		}
		article header h1 {
			margin: 0;
			padding: 0;
			margin-top: 1rem;
			margin-bottom: 0.25rem;
			font-size: 1.2rem;
		}
		article header p.date {
			margin: 0;
			padding: 0;
			margin-top: 0.5rem;
			margin-bottom: 0.5rem;
			font-size: 0.8rem;
		}
		article header p.tags {
			margin: 0;
			padding: 0;
			margin-top: 0.5rem;
			margin-bottom: 0.5rem;
			font-size: 0.8rem;
			font-style: italic;
		}
		article header p.tags span.tag {
			padding-right: 1rem;
		}
		article header p.tags span.tag:before {
			content: '# ';
			color: gray;
		}
		article figure img {
			max-width: 100%;
		}
		article blockquote {
			margin-left: 1rem;
			border-left: thin dashed #CCCCCC;
			padding-left: 0.5rem;
			margin-right: 0;
		}
		video {
			max-width: 100%;
		}
	</style>
<!-- end bbinclude -->
	<style type="text/css">
	</style>
</head>
<body>
<article>
<header>
	<h1><a href="{{ post_url }}">{{ tumblr_title }}{{ post_id }}{{ post_slug }}</a></h1>
	<p class="date">{{ post_date }}</p>
	{{# post_tags }}<p class="tags">{{# tags }}<span class="tag">{{ tag }}</span>{{/ tags }}</p>{{/ post_tags }}
</header>
{{# post_photos }}
<figure>
	<a href="{{ url }}"><img src="{{ url }}" /></a>
	<figcaption>{{& caption }}</figcaption>
</figure>
{{/ post_photos }}
{{# post_videos }}
<!-- {{& source }} -->
{{ controller }}
{{& caption }}
{{/ post_videos }}
{{& post_body }}
</article>
</body>
</html>

convert-imgs

# change all img tag sources to use data URIs
#   %output - the output spec; its <data> entry is rewritten in place
sub convert-imgs (%output) {
    my $ua = HTTP::UserAgent.new();

    # fetch one url and return the text that replaces it: a base64 data URI on success, an "error: ..." text otherwise
    sub convert-imgs-helper ($url) {
        my $r = $ua.get: ~$url;
        if $r.is-success {
            return "data:{$r.content-type};base64," ~ MIME::Base64.encode($r.content, :oneline);
        }
        else {
            # NOTE(review): get-tumblr-data reports failures with .status-line; confirm that .response-line exists on the response object
            return "error: {$r.response-line}";
        }
    }

    # the lookbehind anchors the match just after src=" inside an img tag, so only the url itself is replaced
    %output<data> ~~ s:g/ <?after \<img .+? src\=\"> (<-["]>+) /{ convert-imgs-helper($0) }/;
}

or going whole-hog on Perl 6

# the same conversion as one expression: the signature unpacks the <data> entry of the
# argument as a writable $data, and .subst computes each replacement in the pointy block
sub convert-imgs (% (:$data! is rw)) {
    $data .= subst: :g,
        / <?after \<img .+? src\=\"> (<-["]>+) /,
        -> $ ( Str() $url ) {
            # a new HTTP::UserAgent is created for every matched url
            with HTTP::UserAgent.new.get: $url {
                when .is-success {
                    "data:{.content-type};base64,"
                        ~  MIME::Base64.encode: .content, :oneline
                }
                # NOTE(review): confirm that .response-line exists on the response object
                "error: {.response-line}"
            }
        }
}

save-post

# write the file to the disk, in the appropriate directory
#   %output - the output spec; <dir>, <file> and <data> are used
# ends the program through croak when the directory or the file cannot be written
sub save-post (%output) {
    if !%output<dir>.IO.d {
        (try mkdir %output<dir>) orelse croak("Could not create output directory '%output<dir>': $!");
    }
    # dir and file are joined without a separator; create-output-html makes dir end in '/'
    with %output<dir file>.join -> $f {
        (try spurt $f, %output<data>) orelse croak("Could not write output file '$f': $!");
    }

    # @NOTE not implemented download handling
}

Questions?

Code highlighting

BBEdit inline Perl to HTML

#!/usr/bin/env perl
# Filter: reads Perl source from the files named on the command line (or from
# standard input), runs it through perltidy twice -- first to beautify it, then
# to turn the result into an HTML fragment -- and prints that fragment.
use Modern::Perl '2014';
use Perl::Tidy;

# grab the input data
# (with $/ localized to undef, <> returns the whole input as one string)
my $src = do { local $/; <> };

# beautify it
my $beauty = undef;
Perl::Tidy::perltidy(
    source      => \$src,
    destination => \$beauty,
    argv        => [
        '--noprofile',                      # ignore a .perltidyrc; all settings come from this file
        '--quiet',                          # don't mess things up if something goes wrong
        '--maximum-line-length=0',          # I will take care about splitting lines where I want them...
        '--ignore-side-comment-lengths',    # don't include the length of the comments in the line length
        '--indent-columns=4',               # use 4 columns for a tab stop
        '--nooutdent-long-quotes',          # long quotes should not be outdented, even if they exceed line length
        '--nooutdent-long-comments',        # leave long comment lines alone
        '--add-semicolons',                 # make sure all blocks are terminated by a semi-colon
        '--keep-interior-semicolons',       # there's usually a really good reason for multiple commands on one line
        '--stack-opening-tokens',           # I prefer opening tokens to be inline
        '--square-bracket-tightness=0',     # I like single items in square brackets to have spaces
        '--nodelete-old-newlines',
        '--keep-old-blank-lines=1',
    ],
);

# html format it
# (the beautified text from the first pass is the source of the second)
my $html = undef;
Perl::Tidy::perltidy(
    source      => \$beauty,
    destination => \$html,
    argv        => [
        '-html',                            # produce html output
        '-pre',                             # but only the actual code, not the entire page
    ],
);

# print the result
print $html;

The same, for Perl 6

#!/usr/bin/env perl
# Filter: reads source text from the files named on the command line (or from
# standard input), has Text::VimColor mark it up with the 'perl6' filetype, and
# prints the resulting HTML wrapped in <pre> tags.
use Modern::Perl '2015';
use Text::VimColor;

# grab the input data
# (with $/ localized to undef, <> returns the whole input as one string)
my $src = do { local $/; <> };

my $syntax = Text::VimColor->new(
    string            => $src,
    filetype          => 'perl6',
    all_syntax_groups => 1,
);

say '<pre>' . $syntax->html . '</pre>';

The presentation code

I tried a variety of web-presentation projects, but all of them want the slides to be a fixed height. For this presentation, I wanted to be able to have code snippets of varying length, but not have to worry about either having to scroll a tiny window, or having the font made too small.

Instead, I spent a tiny bit of time developing a presentation “framework” in JavaScript.

Because this is not Perl, and not the focus of the talk, I'm not going to go through it, but here is the HTML required for the framework (minus some of the unimportant styles).

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8" />
    <title>{title}</title>
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />

    <style type="text/css">
        /* set some sensible defaults */
        * { box-sizing: border-box; }
        html {
            font-size: 20px;
        }
        body {
            margin: 0; padding: 0;
            font-size: 1rem; line-height: 1.5;
        }

        /* all headings should have the same height, even as the font sizes get smaller */
        h1, h2, h3, h4, h5, h6 {
            margin: 0; padding: 0;
            line-height: 3rem;
        }
        h1 { font-size: 2rem; }
        h2 { font-size: 1.7rem; }
        h3 { font-size: 1.5rem; }
        /* ... */

        /* fix the buttons based on the viewport, and make them almost invisible */
        button {
            padding: 0;
            border: thin solid gray;
            background-color: white;
            font-size: 2rem;
            line-height: 1;
            opacity: 0.1;
            text-align: center;
            -webkit-appearance: none;
        }
        button#dtoc {
            position: fixed;
            bottom: 0;
            left: 0;
            width: 2rem;
            height: 2rem;
        }
        button#next {
            position: fixed;
            bottom: 0;
            right: 0;
            width: 6rem;
            height: 2rem;
        }
        button#prev {
            position: fixed;
            bottom: 0;
            right: 6.5rem;
            width: 6rem;
            height: 2rem;
        }

        /* the nav needs to display items slightly differently, and make the headings indent */
        nav {
            padding: 0.5rem;
            position: fixed;
            top: 0;
            bottom: 2rem;
            left: 0;
            width: 25%;
            overflow-x: hidden;
            overflow-y: auto;
            background-color: #EEEEEE;
            border-right: thin solid black;
            border-bottom: thin solid black;
            white-space: nowrap;
        }
        nav { display: none; }
        nav.visible { display: block; }
        nav h1, nav h2, nav h3, nav h4, nav h5, nav h6 {
            font-size: 0.6rem;
            line-height: 1.5;
            font-weight: normal;
            font-style: normal;
        }
        nav h2 { padding-left: 1rem; }
        nav h3 { padding-left: 2rem; }
        /* ... */

        /* styles for the slides */
        main {
            margin: 0;
            padding: 0;
        }
        main section {
            margin: 1rem;
            margin-bottom: 2.5rem;
        }

        code {
            padding: 0.1rem 0.5rem;
        }

        div.code {
            margin-left: 1.5rem;
            max-width: 100%;
            overflow: auto;
            border: thin solid #DDDDDD;
        }
        div.code pre {
            margin: 0;
            font-family: monospace;
        }
    </style>

    <style type="text/css">
        /* the styles that make this a presentation vs just a webpage */
        main.slides section.slide {
            display: none;
        }
        main.slides section.slide.active {
            display: block;
        }
    </style>

    <script type="text/javascript">
        /* flip the "visible" class on the table of contents, showing or hiding it */
        function toggle_toc () {
            var toc = document.getElementById("toc");
            toc.classList.toggle("visible");
        }

        /* go forward one slide; does nothing when no slide is active
           or when the active slide has no following sibling element */
        function go_next () {
            var currelt = document.querySelector(".active");
            if (currelt == null) { return; }    // nothing is active, so there is no "next"

            var nextelt = currelt.nextElementSibling;
            if (nextelt != null) {
                currelt.classList.remove("active");
                nextelt.classList.add("active");
                nextelt.scrollIntoView();
            }
        }

        /* go backward one slide; does nothing when no slide is active
           or when the active slide has no preceding sibling element */
        function go_prev () {
            var currelt = document.querySelector(".active");
            if (currelt == null) { return; }    // nothing is active, so there is no "previous"

            var prevelt = currelt.previousElementSibling;
            if (prevelt != null) {
                currelt.classList.remove("active");
                prevelt.classList.add("active");
                prevelt.scrollIntoView();
            }
        }

        /* activate the slide with the given id and close the table of contents;
           nothing happens unless both an active slide and the target exist */
        function go_slide (sid) {
            var active = document.querySelector(".active");
            var target = document.getElementById(sid);
            if (active == null || target == null) { return; }

            active.classList.remove("active");
            target.classList.add("active");
            target.scrollIntoView();
            document.getElementById("toc").classList.remove("visible");
        }

        /* set up everything: optionally switch off slide mode, activate the first
           slide, give every slide an id, and build the table of contents.
           A page without any slides is left alone instead of raising an error. */
        function initialize_presentation () {
            // allow to display as one page, for easier development
            if (window.location.search == '?showall') {
                document.getElementById("presentation").classList.remove("slides");
            }

            var slides = document.querySelectorAll("main#presentation section.slide");
            if (slides.length == 0) { return; }

            // set the first slide to be the active one
            slides[0].classList.add("active");

            // counter behind the generated ids "s1", "s2", ...;
            // it only advances when an id is actually handed out
            var generated = 0;

            // go through the slides and assign each a unique id
            // also, create the table of contents
            var nav = document.getElementById("toc");
            // Array.prototype.forEach is borrowed so that this does not depend
            // on the slide list having a forEach of its own
            Array.prototype.forEach.call(slides, function (slide) {
                // assign a unique id to each slide that does not have one yet
                if (slide.id == "") {
                    generated += 1;
                    slide.id = "s" + generated.toString();
                }

                // get the first heading in each slide and add it to the table of contents
                var heading = slide.querySelector("h1, h2, h3, h4, h5, h6");
                if (heading != null) {
                    var node = heading.cloneNode(true);
                    node.dataset.target = slide.id;
                    node.addEventListener("click", function () { go_slide(slide.id); });
                    nav.appendChild(node);
                }
            });
        }
    </script>

</head>
<body>
<button id="dtoc" onclick="toggle_toc();"></button>
<button id="next" onclick="go_next();">➡︎</button>
<button id="prev" onclick="go_prev();">⬅︎</button>

<nav id="toc"></nav>

<main id="presentation" class="slides">

    <section class="slide">
        <h1>{title}</h1>
    </section>

    <section class="slide">
        <h2>{another title}</h2>

        <p>{your text here}</p>
    </section>

    <!-- etc... -->

</main>

<script type="text/javascript">
initialize_presentation();
</script>

</body>
</html>

Fin