#!/usr/bin/env perl use Modern::Perl '2020'; use open qw(:std :utf8); use WWW::Mechanize; use Syntax::Keyword::Try; use Mojo::DOM58; use Path::Tiny; use POSIX; use Data::Dump 'pp'; my $work = ( shift @ARGV ) // ''; # check to see if we are passed a url if ($work) { $work =~ s/^https:\/\/archiveofourown\.org//; } my $datetime = POSIX::strftime( '%Y-%m-%d_%H%M', localtime() ); my $lastdate = [ path('./rundates.txt')->lines_utf8( { chomp => 1, count => -1 } ) ]->[ 0 ]; if ( !$work ) { ## if we were not passed a URL, push the current run date to the tracking log path('./rundates.txt')->append_utf8("${datetime}\n"); } my $mech = WWW::Mechanize->new(); # log in my $user = 'user'; my $pass = 'pass'; $mech->get('https://archiveofourown.org/users/login'); $mech->submit_form( 'form_id' => 'new_user', 'fields' => { 'user[login]' => $user, 'user[password]' => $pass, } ); say "logged in"; my $urls = []; if ($work) { # only grab the one work we've been passed push @$urls, [ "cli specified work", $work ]; } else { # grab all bookmarks my $burl = 'https://archiveofourown.org/users/' . $user . '/bookmarks?page='; my $i = 1; my $lis = []; while (1) { try { $mech->get( $burl . $i ); my $dom = Mojo::DOM58->new( $mech->content() ); my $ol = $dom->at('ol.bookmark.index.group'); if ($ol) { my $li = $ol->find('li.bookmark'); if ( $li->size() ) { print "$i .. "; $li->map( sub { push @$lis, Mojo::DOM58->new($_); } ); say "done"; } else { last; } } else { say "could not find ol on page $i"; last; } $i += 1; } catch ($e) { warn "error: $e\n"; } sleep 30; # maximum of 30 pages in 10 minutes for bookmarks, so sleep to not run into this. } say "done retrieving bookmarks"; my $bookmarks = ''; foreach my $b (@$lis) { my $res = process_bookmark($b); my ( $loc, $text ) = @$res; if (@$loc) { push @$urls, $loc; } $bookmarks .= "\t" . $text . "\n"; } my $out = <<~EOHTML; <!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8" /> <title>AO3 Bookmarks for ${datetime}</title> <link rel="stylesheet" href="../style.css" type="text/css" /> </head> <body class="bookmarks"> <h1>AO3 Bookmarks for ${datetime}</h1> <ul class="bookmarks"> $bookmarks </ul> </body> </html> EOHTML path( 'bookmarks/ao3bookmarks_' . $datetime . '.html' )->spew_utf8($out); } # go through the URLs, and retrieve each work my $errors = []; my $changed = []; while ( my $item = shift @$urls ) { my ( $title, $url ) = @$item; if ( $url =~ m[/series/(\d+)] ) { my $id = $1; my $path = 'series/' . $id . '.html'; print "saving series $id ($title) .. "; try { $mech->get( 'https://archiveofourown.org' . $url ); my $res = process_series( $mech->content() ); my $text = shift @$res; path($path)->spew_utf8($text); unshift @$urls, @$res; # add the work urls to the list of items to process say "done (" . scalar(@$urls) . " remaining)"; } catch ($e) { push @$errors, "Error saving series $id ($title) <$url>: $e"; say "error: $e"; } } elsif ( $url =~ m[/works/(\d+)] ) { my $id = $1; my $path = 'works/' . $id . '.html'; print "saving work $id ($title) .. "; try { $mech->get( 'https://archiveofourown.org' . $url . '?view_full_work=true' ); my $text = process_work( $id, $mech->content() ); my $save_images = 1; if ( path($path)->exists() ) { # make sure we aren't overwriting things my $old = path($path)->slurp_utf8(); if ( $old ne $text ) { path($path)->move( 'works/' . $id . '-' . $lastdate . '.html' ); # move the existing file to have the last run date $path = 'works/' . $id . '-' . $datetime . '.html'; # save the new file with the date push @$changed, "$id ($title) changed; saved as $path"; } else { $save_images = 0; } } path($path)->spew_utf8($text); if ($save_images) { ## TODO: save images } say "done (" . scalar(@$urls) . " remaining)"; } catch ($e) { push @$errors, "Error saving $id ($title) <$url>: $e"; say "error: $e"; } } elsif ( $url =~ m[/external_works/(\d+)] ) { my $id = $1; push @$errors, "external work '$title' ($url) not saved"; } else { push @$errors, "unknown url type $url"; } sleep 5; # placate the AO3 time limits (max of 120 pages in 6 minutes) } say "done retrieving works"; if (@$changed) { say "Changed:"; print "\t" . join( "\n\t" => @$changed ) . "\n"; } if (@$errors) { say "Errors:"; print "\t" . join( "\n\t" => @$errors ) . "\n"; } # process a bookmark sub process_bookmark { my $b = shift; $b->find('div.dynamic')->map('remove'); # get rid of some small empty divs $b->descendant_nodes->grep( sub { $_->type eq 'comment' } ) # get rid of any HTML comments ->map('remove'); # remove status paragraph $b->find('p.status')->map('remove'); # our later transformation will remove some spaces that are nicer to keep # so fix this here $b->find('div.header.module h4.heading')->first( sub { $_->child_nodes()->map( sub { if ( $_->type() eq 'text' ) { my $content = $_->content(); $content =~ s/^\s*|\s*$//g; # strip external spaces if ( $content ne '' ) { $_->content( '[|space|]' . $content . '[|space|]' ); # add a placeholder } } } ); } ); # simplify required tags $b->find('div.header.module ul.required-tags li')->map( sub { $_->content( $_->all_text() ); } ); # remove other tag destinations $b->find('a.tag')->map( sub { delete $_->attr->{'href'}; } ); # remove things that are likely to change in the stats $b->find('dl.stats')->map( sub { $_->find('.collections, .comments, .kudos, .bookmarks, .hits') ->map('remove'); } ); $b->find('div.own.user.module.group')->map( sub { $_->at('h5.byline.heading')->remove(); # get rid of the fact that I bookmarked it $_->at('p.datetime')->prepend_content('Bookmarked on '); # make the purpose of the date obvious $_->at('ul.actions')->remove(); # get rid of "delete bookmark" form } ); # extract the url # NB: this presumes that the relavent url is the first one! my $loc = []; $b->find('div.header.module h4.heading a')->first( sub { $loc = [ $_->text(), $_->attr->{'href'} ]; } ); # turn it into a single line string my $text = $b->to_string(); $text =~ s/\n\s*//g; # fix some things $text =~ s(\[\|space\|\])( )g; # replace the placeholder for spaces $text =~ s[<dt>Bookmarks:</dt><dd><a href="/series/\d+/bookmarks">\d+</a></dd>][]; # series don't label the bookmarks count properly return [ $loc, $text ]; } # process a series sub process_series { my $html = shift; my $dom = Mojo::DOM58->new($html); my $urls = []; # pull out the parts we want my $title = $dom->at('head title')->all_text(); my $heading = $dom->at('h2.heading')->all_text(); my $meta_dom = Mojo::DOM58->new( $dom->at('div#main div.wrapper dl.series.meta.group') ); my $mark_dom = Mojo::DOM58->new( $dom->at('ul.series.work.index.group') ); # strip surrounding whitespace from the title $title =~ s/^\s+|\s+$//g; $heading =~ s/^\s+|\s+$//g; # clean up the meta data $meta_dom->descendant_nodes->grep( sub { $_->type eq 'comment' } ) # get rid of any HTML comments ->map('remove'); my $meta_text = $meta_dom->to_string(); $meta_text =~ s/\s*\n\s*//g; $meta_text =~ s[<dt>Bookmarks:</dt><dd><a href="/series/\d+/bookmarks">\d+</a></dd>][]; # series don't label the bookmarks count properly # process the series items my $res = []; $mark_dom->at('ul.series.work.index.group')->children('li') # run through each work in the series and treat it as a bookmark ->map( sub { push @$res, process_bookmark( Mojo::DOM58->new($_) ); } ); my $bookmarks = ''; foreach my $r (@$res) { my ( $loc, $text ) = @$r; if (@$loc) { push @$urls, $loc; } $bookmarks .= "\t" . $text . "\n"; } # generate a very basic HTML wrapper my $text = <<~EOHTML; <!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8" /> <meta name="viewport" content="width=device-width, initial-scale=1.0" /> <title>${title}</title> <link rel="stylesheet" href="../style.css" type="text/css" /> </head> <body class="series"> <h1>${heading}</h1> <div class="meta"> ${meta_text} </div> <ul class="bookmarks"> ${bookmarks} </ul> </body> </html> EOHTML return [ $text, @$urls ]; } # process a work sub process_work { my ( $id, $html ) = @_; my $dom = Mojo::DOM58->new($html); # pull out the parts we want my $title = $dom->at('head title')->all_text(); my $meta_dom = Mojo::DOM58->new( $dom->at('div#main div.wrapper dl.work.meta.group') ); my $work_dom = Mojo::DOM58->new( $dom->at('div#workskin') ); # strip surrounding whitespace from the title $title =~ s/^\s+|\s+$//g; # clean up the meta data $meta_dom->descendant_nodes->grep( sub { $_->type eq 'comment' } ) # get rid of any HTML comments ->map('remove'); $meta_dom->find('dt.warning.tags')->map( sub { # clean up an annoying link I don't want my $t = $_->content(); $t =~ s[<a href="[^"]+">][]; $t =~ s[</a>][]; $_->content($t); } ); $meta_dom->find('a.tag')->map( sub { delete $_->attr->{'href'}; } ); # remove link destinations from tags $meta_dom->find('.collections')->map('remove'); $meta_dom->find('dl.stats')->map( sub { # get rid of things that change $_->find('.comments, .kudos, .bookmarks, .hits') ->map('remove'); } ); # clean up the actual text $work_dom->descendant_nodes->grep( sub { $_->type eq 'comment' } ) # get rid of any HTML comments ->map('remove'); $work_dom->at('div#workskin')->tap( sub { # change to 'main' tag and fix up class/id $_->tag('main'); $_->attr( 'class' => 'work' ); delete $_->attr->{'id'}; } ); $work_dom->find('h3.landmark.heading')->map('remove'); # remove an unnecessary (hidden) heading # fix the chapter endnote links, which annoyingly swap format seemingly at random $work_dom->find('div.chapter div.notes a')->each( sub { my $e = shift; if ( defined( $e->{'href'} ) and ( $e->{'href'} =~ m/(#chapter_\d+_endnotes)$/ ) ) { $e->{'href'} = $1; } } ); #stringify my $meta = $meta_dom->to_string(); my $work = $work_dom->to_string(); # generate a very basic HTML wrapper my $text = <<~EOHTML; <!DOCTYPE html> <html lang="en"> <head> <meta charset="utf-8" /> <meta name="viewport" content="width=device-width, initial-scale=1.0" /> <title>${title}</title> <link rel="canonical" href="https://archiveofourown.org/works/$id" /> <link rel="stylesheet" href="../style.css" type="text/css" /> </head> <body class="work"> <h1>${title}</h1> <div class="meta"> ${meta} </div> ${work} </body> </html> EOHTML $text = Mojo::DOM58->new($text)->to_string(); # run it through Mojo::DOM58 to canonicalize the attribute order # (this helps with helper programs) return $text; }