download.pl

#!/usr/bin/env perl
use Modern::Perl '2020';
use open qw(:std :utf8);
use WWW::Mechanize;
use Syntax::Keyword::Try;
use Mojo::DOM58;
use Path::Tiny;
use POSIX;
use Data::Dump 'pp';

my $work = ( shift @ARGV ) // '';    # check to see if we are passed a url

if ($work) {
    $work =~ s/^https:\/\/archiveofourown\.org//;
}

my $datetime = POSIX::strftime( '%Y-%m-%d_%H%M', localtime() );
my $lastdate = [ path('./rundates.txt')->lines_utf8( { chomp => 1, count => -1 } ) ]->[ 0 ];

if ( !$work ) {
    ## if we were not passed a URL, push the current run date to the tracking log
    path('./rundates.txt')->append_utf8("${datetime}\n");
}

my $mech = WWW::Mechanize->new();

# log in
my $user = 'user';
my $pass = 'pass';

$mech->get('https://archiveofourown.org/users/login');
$mech->submit_form(
    'form_id' => 'new_user',
    'fields'  => {
        'user[login]'    => $user,
        'user[password]' => $pass,
    }
);

say "logged in";

my $urls = [];
if ($work) {    # only grab the one work we've been passed
    push @$urls, [ "cli specified work", $work ];
}
else {    # grab all bookmarks
    my $burl = 'https://archiveofourown.org/users/' . $user . '/bookmarks?page=';
    my $i    = 1;
    my $lis  = [];
    while (1) {
        try {
            $mech->get( $burl . $i );
            my $dom = Mojo::DOM58->new( $mech->content() );
            my $ol  = $dom->at('ol.bookmark.index.group');
            if ($ol) {
                my $li = $ol->find('li.bookmark');
                if ( $li->size() ) {
                    print "$i .. ";
                    $li->map( sub { push @$lis, Mojo::DOM58->new($_); } );
                    say "done";
                }
                else {
                    last;
                }
            }
            else {
                say "could not find ol on page $i";
                last;
            }
            $i += 1;
        }
        catch ($e) {
            warn "error: $e\n";
        }
        sleep 30;    # maximum of 30 pages in 10 minutes for bookmarks, so sleep to not run into this.
    }
    say "done retrieving bookmarks";

    my $bookmarks = '';
    foreach my $b (@$lis) {
        my $res = process_bookmark($b);
        my ( $loc, $text ) = @$res;
        if (@$loc) { push @$urls, $loc; }
        $bookmarks .= "\t" . $text . "\n";
    }

    my $out = <<~EOHTML;
	<!DOCTYPE html>
	<html lang="en">
	<head>
		<meta charset="utf-8" />
		<title>AO3 Bookmarks for ${datetime}</title>
		<link rel="stylesheet" href="../style.css" type="text/css" />
	</head>
	<body class="bookmarks">
	<h1>AO3 Bookmarks for ${datetime}</h1>
	<ul class="bookmarks">
	$bookmarks
	</ul>
	</body>
	</html>
	EOHTML

    path( 'bookmarks/ao3bookmarks_' . $datetime . '.html' )->spew_utf8($out);
}

# go through the URLs, and retrieve each work
my $errors  = [];
my $changed = [];
while ( my $item = shift @$urls ) {
    my ( $title, $url ) = @$item;
    if ( $url =~ m[/series/(\d+)] ) {
        my $id   = $1;
        my $path = 'series/' . $id . '.html';
        print "saving series $id ($title) .. ";
        try {
            $mech->get( 'https://archiveofourown.org' . $url );
            my $res  = process_series( $mech->content() );
            my $text = shift @$res;
            path($path)->spew_utf8($text);
            unshift @$urls, @$res;    # add the work urls to the list of items to process
            say "done (" . scalar(@$urls) . " remaining)";
        }
        catch ($e) {
            push @$errors, "Error saving series $id ($title) <$url>: $e";
            say "error: $e";
        }
    }
    elsif ( $url =~ m[/works/(\d+)] ) {
        my $id   = $1;
        my $path = 'works/' . $id . '.html';
        print "saving work $id ($title) .. ";
        try {
            $mech->get( 'https://archiveofourown.org' . $url . '?view_full_work=true' );
            my $text        = process_work( $id, $mech->content() );
            my $save_images = 1;
            if ( path($path)->exists() ) {    # make sure we aren't overwriting things
                my $old = path($path)->slurp_utf8();
                if ( $old ne $text ) {
                    path($path)->move( 'works/' . $id . '-' . $lastdate . '.html' );    # move the existing file to have the last run date
                    $path = 'works/' . $id . '-' . $datetime . '.html';                 # save the new file with the date
                    push @$changed, "$id ($title) changed; saved as $path";
                }
                else {
                    $save_images = 0;
                }
            }
            path($path)->spew_utf8($text);
            if ($save_images) {
                ## TODO: save images
            }
            say "done (" . scalar(@$urls) . " remaining)";
        }
        catch ($e) {
            push @$errors, "Error saving $id ($title) <$url>: $e";
            say "error: $e";
        }
    }
    elsif ( $url =~ m[/external_works/(\d+)] ) {
        my $id = $1;
        push @$errors, "external work '$title' ($url) not saved";
    }
    else {
        push @$errors, "unknown url type $url";
    }

    sleep 5;    # placate the AO3 time limits (max of 120 pages in 6 minutes)
}
say "done retrieving works";
if (@$changed) {
    say "Changed:";
    print "\t" . join( "\n\t" => @$changed ) . "\n";
}
if (@$errors) {
    say "Errors:";
    print "\t" . join( "\n\t" => @$errors ) . "\n";
}

# process a bookmark
sub process_bookmark {
    my $b = shift;

    $b->find('div.dynamic')->map('remove');                        # get rid of some small empty divs
    $b->descendant_nodes->grep( sub { $_->type eq 'comment' } )    # get rid of any HTML comments
      ->map('remove');

    # remove status paragraph
    $b->find('p.status')->map('remove');

    # our later transformation will remove some spaces that are nicer to keep
    # so fix this here
    $b->find('div.header.module h4.heading')->first( sub {
        $_->child_nodes()->map( sub {
            if ( $_->type() eq 'text' ) {
                my $content = $_->content();
                $content =~ s/^\s*|\s*$//g;    # strip external spaces
                if ( $content ne '' ) {
                    $_->content( '[|space|]' . $content . '[|space|]' );    # add a placeholder
                }
            }
        } );
    } );

    # simplify required tags
    $b->find('div.header.module ul.required-tags li')->map( sub { $_->content( $_->all_text() ); } );

    # remove other tag destinations
    $b->find('a.tag')->map( sub { delete $_->attr->{'href'}; } );

    # remove things that are likely to change in the stats
    $b->find('dl.stats')->map( sub {
        $_->find('.collections, .comments, .kudos, .bookmarks, .hits')
          ->map('remove');
    } );

    $b->find('div.own.user.module.group')->map( sub {
        $_->at('h5.byline.heading')->remove();                      # get rid of the fact that I bookmarked it
        $_->at('p.datetime')->prepend_content('Bookmarked on ');    # make the purpose of the date obvious
        $_->at('ul.actions')->remove();                             # get rid of "delete bookmark" form
    } );

    # extract the url
    # NB: this presumes that the relavent url is the first one!
    my $loc = [];
    $b->find('div.header.module h4.heading a')->first( sub {
        $loc = [ $_->text(), $_->attr->{'href'} ];
    } );

    # turn it into a single line string
    my $text = $b->to_string();
    $text =~ s/\n\s*//g;

    # fix some things
    $text =~ s(\[\|space\|\])( )g;                                                        # replace the placeholder for spaces
    $text =~ s[<dt>Bookmarks:</dt><dd><a href="/series/\d+/bookmarks">\d+</a></dd>][];    # series don't label the bookmarks count properly

    return [ $loc, $text ];
}

# process a series
sub process_series {
    my $html = shift;
    my $dom  = Mojo::DOM58->new($html);
    my $urls = [];

    # pull out the parts we want
    my $title    = $dom->at('head title')->all_text();
    my $heading  = $dom->at('h2.heading')->all_text();
    my $meta_dom = Mojo::DOM58->new( $dom->at('div#main div.wrapper dl.series.meta.group') );
    my $mark_dom = Mojo::DOM58->new( $dom->at('ul.series.work.index.group') );

    # strip surrounding whitespace from the title
    $title   =~ s/^\s+|\s+$//g;
    $heading =~ s/^\s+|\s+$//g;

    # clean up the meta data
    $meta_dom->descendant_nodes->grep( sub { $_->type eq 'comment' } )    # get rid of any HTML comments
      ->map('remove');
    my $meta_text = $meta_dom->to_string();
    $meta_text =~ s/\s*\n\s*//g;
    $meta_text =~ s[<dt>Bookmarks:</dt><dd><a href="/series/\d+/bookmarks">\d+</a></dd>][];    # series don't label the bookmarks count properly

    # process the series items
    my $res = [];
    $mark_dom->at('ul.series.work.index.group')->children('li')                                # run through each work in the series and treat it as a bookmark
      ->map( sub { push @$res, process_bookmark( Mojo::DOM58->new($_) ); } );

    my $bookmarks = '';
    foreach my $r (@$res) {
        my ( $loc, $text ) = @$r;
        if (@$loc) { push @$urls, $loc; }
        $bookmarks .= "\t" . $text . "\n";
    }

    # generate a very basic HTML wrapper
    my $text = <<~EOHTML;
	<!DOCTYPE html>
	<html lang="en">
	<head>
		<meta charset="utf-8" />
		<meta name="viewport" content="width=device-width, initial-scale=1.0" />
		<title>${title}</title>
		<link rel="stylesheet" href="../style.css" type="text/css" />
	</head>
	<body class="series">
	<h1>${heading}</h1>
	<div class="meta">
		${meta_text}
	</div>
	<ul class="bookmarks">
	${bookmarks}
	</ul>
	</body>
	</html>
	EOHTML

    return [ $text, @$urls ];
}

# process a work
sub process_work {
    my ( $id, $html ) = @_;
    my $dom = Mojo::DOM58->new($html);

    # pull out the parts we want
    my $title    = $dom->at('head title')->all_text();
    my $meta_dom = Mojo::DOM58->new( $dom->at('div#main div.wrapper dl.work.meta.group') );
    my $work_dom = Mojo::DOM58->new( $dom->at('div#workskin') );

    # strip surrounding whitespace from the title
    $title =~ s/^\s+|\s+$//g;

    # clean up the meta data
    $meta_dom->descendant_nodes->grep( sub { $_->type eq 'comment' } )    # get rid of any HTML comments
      ->map('remove');
    $meta_dom->find('dt.warning.tags')->map( sub {                        # clean up an annoying link I don't want
        my $t = $_->content();
        $t =~ s[<a href="[^"]+">][];
        $t =~ s[</a>][];
        $_->content($t);
    } );
    $meta_dom->find('a.tag')->map( sub { delete $_->attr->{'href'}; } );    # remove link destinations from tags
    $meta_dom->find('.collections')->map('remove');
    $meta_dom->find('dl.stats')->map( sub {                                 # get rid of things that change
        $_->find('.comments, .kudos, .bookmarks, .hits')
          ->map('remove');
    } );

    # clean up the actual text
    $work_dom->descendant_nodes->grep( sub { $_->type eq 'comment' } )      # get rid of any HTML comments
      ->map('remove');
    $work_dom->at('div#workskin')->tap( sub {                               # change to 'main' tag and fix up class/id
        $_->tag('main');
        $_->attr( 'class' => 'work' );
        delete $_->attr->{'id'};
    } );
    $work_dom->find('h3.landmark.heading')->map('remove');                  # remove an unnecessary (hidden) heading

    # fix the chapter endnote links, which annoyingly swap format seemingly at random
    $work_dom->find('div.chapter div.notes a')->each( sub {
        my $e = shift;
        if ( defined( $e->{'href'} ) and ( $e->{'href'} =~ m/(#chapter_\d+_endnotes)$/ ) ) {
            $e->{'href'} = $1;
        }
    } );

    #stringify
    my $meta = $meta_dom->to_string();
    my $work = $work_dom->to_string();

    # generate a very basic HTML wrapper
    my $text = <<~EOHTML;
	<!DOCTYPE html>
	<html lang="en">
	<head>
		<meta charset="utf-8" />
		<meta name="viewport" content="width=device-width, initial-scale=1.0" />
		<title>${title}</title>
		<link rel="canonical" href="https://archiveofourown.org/works/$id" />
		<link rel="stylesheet" href="../style.css" type="text/css" />
	</head>
	<body class="work">
	<h1>${title}</h1>
	<div class="meta">
	${meta}
	</div>
	${work}
	</body>
	</html>
	EOHTML

    $text = Mojo::DOM58->new($text)->to_string();    # run it through Mojo::DOM58 to canonicalize the attribute order
                                                     # (this helps with helper programs)

    return $text;
}