Updated 2018-08-31 18:38:21 by kpv

Keith Vetter 2018-08-31 - This package provides a scripted interface to some of the data on the fan-fiction website AO3 (Archive of Our Own).

I've been wanting this data for a better experience while reading the stories on AO3: having all parts of a story together, being able to read while off-line, and using a better book-reading interface than a web browser. I wanted to gather all the parts of a story, create an epub out of them, and then read it in a book-reading app.

An official API for AO3 data has been on the roadmap for years but it's not out yet. There's a Python package, ao3, that provides an interface using BeautifulSoup to scrape the web pages.

So I decided to create my own AO3 scripting interface. Fortunately, AO3's story web pages have a very consistent format, making web scraping a fragile but viable option. This package uses tdom and its xpath interface to extract data from the AO3 web pages.

Documentation is provided in the package.

see also:

  • EpubCreator -- tool to create an epub from html pages
  • ao3ToEpub -- script that uses this package to create epubs from AO3 stories

namespace eval ::AO3 {
    # This package provides a scripted interface to the stories on AO3 (Archive of Our Own).
    # This is NOT an official API but rather scrapes the web site for the data.
    # It is inspired by the python ao3 package at https://pypi.org/project/ao3/.
    # by Keith Vetter 2018-08-29
    #
    # Sample usage:
    #   set ao3 [::AO3::New 258626]
    #   puts "Title: [$ao3 title]"
    #   puts "Author: [$ao3 author]"
    #   puts "Words: [$ao3 words]"
    #   set storyHtml [$ao3 story]
    #   $ao3 cleanup
    #
    # API Documentation
    # =================
    #   set ao3 [::AO3::New $story_id]
    #      Creates interface object for parsing Archive of Our Own stories
    #
    #   $ao3 cleanup
    #      Frees all the resources associated with this AO3 object
    #
    #   $ao3 title
    #      Returns the title of the story
    #
    #   $ao3 author
    #      Returns the author of the story
    #
    #   $ao3 summary
    #      Returns an html summary of the story
    #
    #   $ao3 story
    #      Returns the html of the story
    #
    #   $ao3 chapter ## html|summary|count
    #      For multi-chapter stories, extract html or summary for a specified chapter
    #
    #   $ao3 additional_tags
    #      Returns a list of additional tags for the story
    #
    #   $ao3 bookmarks
    #      Returns a count of the number of bookmarks for the story
    #
    #   $ao3 category
    #      Returns a list of categories for the story
    #
    #   $ao3 chapters
    #      Returns how many chapters written and planned in the story, e.g. 5/15
    #
    #   $ao3 characters
    #      Returns a list of characters in the story
    #
    #   $ao3 comments
    #      Returns a count of the number of comments for the story
    #
    #   $ao3 fandoms
    #      Returns a list of the fandoms this story is in
    #
    #   $ao3 hits
    #      Returns a count of the number of hits for this story
    #
    #   $ao3 kudos
    #      Returns a count of the number of kudos for this story
    #
    #   $ao3 kudos_left_by
    #      Returns a list of all users who left kudos for this story
    #
    #   $ao3 language
    #      Returns the language the story is written in
    #
    #   $ao3 published
    #      Returns the date the story was published
    #
    #   $ao3 rating
    #      Returns a list of the ratings for this story
    #
    #   $ao3 relationships
    #      Returns a list of relationships in this story
    #
    #   $ao3 warnings
    #      Returns a list of warnings for this story
    #
    #   $ao3 words
    #      Returns a count of words in this story
    #
    #   $ao3 html
    #      Returns the raw html for the story
    #
    #   $ao3 id
    #      Returns this story's id
    #
    #   $ao3 json
    #      Returns most of the metadata about this story wrapped in a json object
    #
    #   $ao3 url
    #      Returns the url to this story's page on Archive of Our Own

    package require tdom
    package require http
    package require tls
    # Route https:// URLs through the tls package so ::http::geturl can fetch
    # pages over TLS.
    # NOTE(review): "-tls1 1" explicitly enables TLS 1.0; which other protocol
    # versions are enabled depends on the installed tls package's defaults, and
    # no SNI option is passed -- confirm this still works against the live site.
    http::register https 443 [list ::tls::socket -tls1 1]
    # _assert only performs its check when this variable is exactly "on".
    variable assertions off

    proc New {id {verbose 0} {rawHtml ""}} {
        # Constructor: builds an AO3 story object and returns the name of the
        # ensemble command that represents it.
        #
        #   id       story id (used for the download url and the cache file name)
        #   verbose  logging threshold passed on to _LOG (0 = ALWAYS only)
        #   rawHtml  "" to download, "cache" to prefer ./<id>.html, a file name,
        #            or the html text itself (see _getHtml)
        #
        # The raw page is stored in a namespace variable that has the same name
        # as the ensemble command; everything else travels in the state dict
        # that is baked into every entry of the ensemble's command map.

        set objName [_uniqueName]
        set pageHtml [_getHtml $id $verbose $rawHtml]
        variable $objName [dict create html $pageHtml]

        set state [dict create \
            id $id \
            dom [::dom parse -html $pageHtml] \
            me $objName \
            verbose $verbose]

        namespace ensemble create -command $objName -map [_buildCommandMap $state]
        return $objName
    }

    # Metadata stats that all share the same page layout.  Each entry is either
    # a single word (accessor name and lookup keyword are the same) or a pair
    # {accessorName lookupKeyword}.  One accessor proc is generated per entry.
    set properties {
        rating category {fandoms fandom} {relationships relationship} {characters character}
        {additional_tags freeform} language words comments kudos hits published bookmarks
        chapters
    }
    foreach property $properties {
        # First element names the proc, last element is the keyword to look up;
        # for single-word entries both are the same word.
        set func [lindex $property 0]
        set keyword [lindex $property end]
        set body [string map [list @KEY@ $keyword] {return [_lookupStat $this @KEY@]}]
        proc [namespace current]::$func {this} $body
    }

    proc _LOG {this level message} {
        # Prints "<first letter of level>: <message>" on stderr.
        #
        # level must be ALWAYS, INFO or DEBUG (ranked 0, 1, 2); anything else is
        # silently ignored.  A message is shown only when its rank does not
        # exceed the "verbose" entry of the this dict.
        set rank [lsearch -exact {ALWAYS INFO DEBUG} $level]
        if {$rank == -1} return
        if {$rank > [dict get $this verbose]} return
        puts stderr "[string index $level 0]: $message"
    }
    proc _getHtml {id verbose rawHtml} {
        # Resolves the rawHtml argument of New into the actual page text.
        #
        #   "cache"      use the file <id>.html if it exists, otherwise download
        #   ""           download the story from AO3
        #   a file name  read that file
        #   other text   taken to be the html itself
        #
        # Raises an error when the resulting text contains no "</" at all.

        if {$rawHtml eq "cache"} {
            set cached "$id.html"
            if {[file exists $cached]} {
                set rawHtml $cached
            } else {
                set rawHtml ""
            }
        }

        if {$rawHtml eq ""} {
            set content [_downloadStory [dict create id $id verbose $verbose]]
        } elseif {[file exists $rawHtml]} {
            _LOG [dict create verbose $verbose] INFO "reading html from file $rawHtml"
            set content [::tDOM::xmlReadFile $rawHtml]
        } else {
            set content $rawHtml
        }

        # Cheap sanity check: real html has at least one closing tag
        if {[string first "</" $content] >= 0} {
            return $content
        }
        error "ERROR: looks like bad html: '[string range $content 0 50]...'"
    }
    proc _downloadStory {this} {
        # Fetches the story page for the id in the this dict and returns the
        # response body.  The url asks for the full work and the adult view.
        # Raises an error when the HTTP status code is not 200.

        set url "https://archiveofourown.org/works/[dict get $this id]?view_full_work=true&view_adult=true"
        _LOG $this INFO "downloading $url"

        set token [::http::geturl $url]
        set status [::http::ncode $token]
        set body [::http::data $token]
        _LOG $this DEBUG "download done: $status [string length $body] bytes"
        # Everything needed has been copied out, so release the token first
        ::http::cleanup $token

        if {$status == 200} {
            return $body
        }
        error "ERROR: download failed: $status url: $url"
    }
    proc _uniqueName {} {
        # Returns a fully qualified command name of the form <namespace>::_obj<N>
        # that is not currently in use.  Counting starts at the number of
        # existing _obj* commands and moves up until a free name is found.
        set ns [namespace current]
        set taken [info commands ${ns}::_obj*]
        set n [llength $taken]
        while {"${ns}::_obj$n" in $taken} {
            incr n
        }
        return "${ns}::_obj$n"
    }
    proc _buildCommandMap {this} {
        # Builds the -map value for the object's ensemble: every command in
        # this namespace becomes a subcommand that is invoked with the state
        # dict as its first argument.  The constructor (New) and all private
        # commands (names starting with "_") are left out.
        set map {}
        foreach fullName [info commands [namespace current]::*] {
            set name [namespace tail $fullName]
            if {$name eq "New"} continue
            if {[string match "_*" $name]} continue
            lappend map $name [list $name $this]
        }
        return $map
    }

    proc _assert {script expected {emsg ""}} {
        # Assertion helper.  Nothing happens unless ::AO3::assertions is "on";
        # in particular the script is not evaluated, so disabled assertions
        # cost nothing.  When enabled, script is run in the caller's scope and
        # its result compared (with ==) to expected; a mismatch raises emsg, or
        # "<actual> != <expected>" when no message was supplied.

        if {$::AO3::assertions eq "on"} {
            set actual [uplevel 1 $script]
            if {$actual != $expected} {
                if {$emsg eq ""} {
                    set emsg "$actual != $expected"
                }
                error $emsg
            }
        }
        return
    }

    proc _FindAllInDom {this tag attribute value} {
        # Returns the list of <tag> nodes, anywhere in the object's dom, whose
        # attribute matches value.  The "id" attribute must match exactly; any
        # other attribute only has to contain value as a substring.  The xpath
        # expression used is logged at DEBUG level.

        set dom [dict get $this dom]
        if {$attribute eq "id"} {
            set predicate "@$attribute='$value'"
        } else {
            set predicate "contains(@$attribute,'$value')"
        }
        set xpath "//$tag\[$predicate\]"
        _LOG $this DEBUG "xpath: $xpath"
        return [$dom selectNodes $xpath]
    }
    proc _innerHtml {html} {
        # Strips the outermost element's opening and closing tags and returns
        # what is left.
        #
        # Step 1 drops everything up to and including the first ">".
        # Step 2 keeps the text in front of the last closing tag.
        # NOTE(review): the first pattern starts with a non-greedy quantifier;
        # Tcl's regexp engine may then prefer the shortest overall match, in
        # which case its trailing \s* consumes nothing -- confirm if leading
        # whitespace matters to callers.
        set withoutOpenTag [regsub {^.*?>\s*} $html ""]
        return [regsub {^(.*)\s*</.*>\s*} $withoutOpenTag {\1}]
    }
}
proc ::AO3::id {this} {
    # Returns the story id this object was created with
    dict get $this id
}
proc ::AO3::url {this} {
    # Returns the url of the story's page on Archive of Our Own
    set storyId [dict get $this id]
    return "https://archiveofourown.org/works/$storyId"
}
proc ::AO3::this {this} {
    # Returns the object's internal state dict unchanged
    set this
}
proc ::AO3::html {this} {
    # Returns the raw page html.  It lives in the "html" entry of the dict
    # stored in the variable named by the object's "me" entry.
    set storageVar [dict get $this me]
    set storage [set $storageVar]
    return [dict get $storage html]
}
proc ::AO3::cleanup {this} {
    # Releases everything owned by the object, in this order:
    #   1. the variable holding the raw html (no error if already gone)
    #   2. the dom tree
    #   3. the ensemble command itself
    set objName [dict get $this me]
    unset -nocomplain $objName
    [dict get $this dom] delete
    rename $objName {}
}
proc ::AO3::save {this fname} {
    # Writes the story's raw html to the file fname (overwriting it).
    #
    # The file is written as utf-8 rather than in the system encoding.
    # NOTE(review): saved files are read back in _getHtml through
    # ::tDOM::xmlReadFile, which presumably assumes utf-8 when the file has
    # no BOM or XML declaration -- confirm against the tDOM documentation.
    #
    # The channel is closed even when the write fails.
    _LOG $this INFO "saving html to $fname"
    set fout [open $fname w]
    try {
        fconfigure $fout -encoding utf-8
        puts -nonewline $fout [::AO3::html $this]
    } finally {
        close $fout
    }
}

proc ::AO3::title {this} {
    # Returns the story title with surrounding whitespace removed.
    #
    # The page keeps the title in a heading like
    #
    #     <h2 class="title heading">[title]</h2>
    #
    # so we take the text of the first <h2> whose class contains "title".
    set headings [_FindAllInDom $this h2 class title]
    _assert {llength $headings} 1 "wrong number of title nodes"
    set firstHeading [lindex $headings 0]
    return [string trim [$firstHeading asText]]
}
proc ::AO3::author {this} {
    # Returns the author name(s) with surrounding whitespace removed.
    #
    # The byline on the page looks like
    #
    #     <h3 class="byline heading">
    #       <a href="/users/[author_name]" rel="author">[author_name]</a>
    #     </h3>
    #
    # so we take the text of the first <h3> whose class contains "byline".
    set bylines [_FindAllInDom $this h3 class byline]
    _assert {llength $bylines} 1 "wrong number of author nodes"
    set firstByline [lindex $bylines 0]
    return [string trim [$firstByline asText]]
}

proc ::AO3::story {this} {
    # Returns the html of the whole story, without the wrapping <div>.
    #
    # The article (story) is kept in a <div> tag of the form
    #
    #     <div id="chapters" role="article">...</div>
    #
    # _FindAllInDom returns a list of nodes, so pick the first one explicitly
    # instead of using the list itself as a command.  A page without that
    # <div> raises a descriptive error rather than 'invalid command name ""'.
    set storyNodes [_FindAllInDom $this div id chapters]
    if {[llength $storyNodes] == 0} {
        error "ERROR: story text not found: page has no <div id=\"chapters\">"
    }
    set storyHtml [[lindex $storyNodes 0] asHTML]
    return [_innerHtml $storyHtml]
}
proc ::AO3::chapter {this chapterNumber {subcommand html}} {
    # Extracts information about a single chapter.
    #
    #   chapterNumber  chapter to look at (ignored by "count")
    #   subcommand     html     the chapter's html, including its outer <div>
    #                  summary  the text of the chapter's summary, or ""
    #                  count    the number of chapter <div>s in the page
    #
    # Each chapter is kept in a <div> tag of the form
    #
    #     <div class="chapter" id="chapter-3">...</div>
    #
    # Not all stories are broken into chapters.  In that case chapter 1 is
    # the whole story (html -> story, summary -> story summary) and every
    # other chapter number yields "".
    #
    # Raises an error for an unknown subcommand.
    #
    if {$subcommand ni {html summary count}} {
        set emsg "ERROR: unknown subcommand: '$subcommand'. "
        append emsg "Must be one of 'html', 'summary' or 'count'"
        error $emsg
    }

    if {$subcommand eq "count"} {
        set xpath {//div[contains(@id,'chapter-')]}
        set all [[dict get $this dom] selectNodes $xpath]
        return [llength $all]
    }

    set chapterNodes [_FindAllInDom $this div id "chapter-$chapterNumber"]
    if {$chapterNodes eq ""} {
        if {$chapterNumber == 1} {
            if {$subcommand eq "html"} {
                return [::AO3::story $this]
            }
            if {$subcommand eq "summary"} {
                return [::AO3::summary $this]
            }
        }
        return ""
    }

    _assert {llength $chapterNodes} 1
    set chapterNode [lindex $chapterNodes 0]

    if {$subcommand eq "html"} {
        # NB. don't call _innerHtml because the outer <div> has useful id attribute
        return [$chapterNode asHTML]
    }

    # subcommand is "summary": search below the chapter node itself.  This
    # avoids building a temporary dom tree, whose nodes must not be used once
    # that tree has been deleted.
    set summaryNodes [$chapterNode selectNodes {.//div[@id='summary']}]
    if {$summaryNodes eq ""} {return ""}
    _assert {llength $summaryNodes} 1
    return [[lindex $summaryNodes 0] asText]
}

proc ::AO3::summary {this} {
    # Returns the author's summary as html, or "" when the page has none.
    #
    # The author summary is kept in the following format:
    #
    #     <div class="summary module" role="complementary">
    #       <h3 class="heading">summary:</h3>
    #       <blockquote class="userstuff">
    #         [author_summary_html]
    #       </blockquote>
    #     </div>
    #
    # The first matching <blockquote> in the page is used.
    #
    # NB. chapter summaries can be fetched via the 'chapter # summary' command
    #
    set dom [dict get $this dom]
    set xpath {//div[contains(@class,'summary')]/blockquote[@class='userstuff']}
    set summaryNodes [$dom selectNodes $xpath]
    if {[llength $summaryNodes] == 0} {
        # No summary block: report "nothing" instead of failing on an empty
        # node name
        return ""
    }
    set summaryHtml [[lindex $summaryNodes 0] asHTML]
    return [_innerHtml $summaryHtml]
}
proc ::AO3::_lookupStat {this which} {
    # Returns the value(s) of one metadata statistic as a list, or "" when
    # the page has no such statistic.
    #
    # Statistics are stored in the form
    #
    #     <dd class="$which">####</dd>
    #
    #    --or--
    #
    #     <dd class="$which tags">
    #       <ul class="commas">
    #         <li><a href="/further-works">[value 1]</a></li>
    #         <li><a href="/more-info">[value 2]</a></li>
    #         <li class="last"><a href="/more-works">[value 3]</a></li>
    #       </ul>
    #     </dd>
    #
    # In the second form we want the data from the individual <li> elements.
    #
    # _FindAllInDom returns a list of nodes; only the first match is used so
    # that several matches do not end up being invoked as one command name.
    #
    set statNodes [_FindAllInDom $this dd class $which]
    if {[llength $statNodes] == 0} { return "" }
    set statNode [lindex $statNodes 0]

    set result {}
    set first [$statNode firstChild]
    # first is "" for a <dd> without children; treat that like plain text
    if {$first ne "" && [$first nodeName] eq "ul"} {
        foreach node [$first childNodes] {
            _assert {$node nodeName} li
            set item [$node firstChild]
            if {$item eq ""} continue   ;# empty <li>: nothing to report
            lappend result [string trim [$item asText]]
        }
    } else {
        lappend result [string trim [$statNode asText]]
    }
    return $result
}

proc ::AO3::warnings {this} {
    # Returns the story's archive warnings.  Same lookup as the other stats,
    # except that a leading "No Archive Warnings Apply" entry is replaced by
    # an empty string.
    set warningList [_lookupStat $this warning]
    if {[lindex $warningList 0] ne "No Archive Warnings Apply"} {
        return $warningList
    }
    return [lreplace $warningList 0 0 ""]
}

proc ::AO3::kudos_left_by {this} {
    # Returns the names of the users who left kudos on the story.
    #
    # The names are the link texts inside the kudos block of the page:
    #
    #     <div id="kudos">
    #       <p class="kudos">
    #         <a href="/users/[username1]">[username1]</a>
    #         <a href="/users/[username2]">[username2]</a>
    #         ...
    #       </p>
    #     </div>
    #
    # Every link below <div id="kudos"> is collected, except the two helper
    # links (id "kudos_collapser" and "kudos_summary") that the page uses to
    # hide parts of very long kudos lists.

    set dom [dict get $this dom]
    set users {}
    foreach link [$dom selectNodes {//div[@id='kudos']//a}] {
        set linkId [$link getAttribute id ""]
        if {$linkId eq "kudos_collapser" || $linkId eq "kudos_summary"} continue
        lappend users [$link asText]
    }
    return $users
}
proc ::AO3::json {this} {
    # Packages up most of the metadata about a story into a json object and
    # returns it as a string.
    #
    # keys is a list of {name type} pairs processed in order:
    #   value    name is an object subcommand returning a single value
    #   list     name is an object subcommand returning a list of values
    #   sublist  opens a nested json object called name
    #   endlist  closes the most recently opened nested object
    #   skip     entry is ignored
    set keys {{id value} {title value} {author value} {summary value}
        {warnings list} {rating list} {category list} {fandoms list}
        {relationships list} {characters list} {additional_tags list}
        {language value}
        {stats sublist}
        {published value} {words value} {chapters value}
        {comments value} {kudos value} {bookmarks value} {hits value}
        {stats endlist}
    }

    set me [dict get $this me]
    set json "{"
    set comma ""        ;# separator to emit before the next member
    set indent "  "
    foreach keyInfo $keys {
        _LOG $this DEBUG "json for $keyInfo"
        lassign $keyInfo key type
        if {$type eq "skip"} continue
        if {$type eq "endlist"} {
            set indent [string range $indent 0 end-2]
            append json "\n$indent\}"
            continue
        }
        if {$type eq "sublist"} {
            append json "$comma\n$indent\"$key\": \{"
            set comma ""
            append indent "  "
            continue
        }
        # Ask the object itself for the value, exactly as a caller would
        set value [$me $key]
        append json "$comma\n$indent\"$key\": [::AO3::_toJson $value $type]"
        set comma ","
    }
    append json "\n}"
    return $json
}
proc ::AO3::_toJson {value type} {
    # Helper function to convert numbers, strings or lists of values into
    # proper json.
    #
    #   type "value"     value is a single number or string
    #   any other type   value is a Tcl list; each item is converted as a
    #                    "value" and the items are wrapped in a json array
    #
    # A value is emitted bare only when it is spelled like a json number;
    # things Tcl accepts as doubles but json does not (NaN, Inf, 0x10, 007,
    # ".5", "+1", surrounding blanks) are emitted as strings.
    #
    # In strings, backslash and double quote are escaped, newlines are
    # flattened to a single space, and all remaining control characters are
    # written as json escapes.
    if {$type eq "value"} {
        if {[regexp {^-?(0|[1-9][0-9]*)(\.[0-9]+)?([eE][-+]?[0-9]+)?$} $value]} {
            return $value
        }
        # Backslash must be listed first; string map scans once, left to
        # right, so nothing gets escaped twice.
        set value [string map [list \\ \\\\ \x22 \\\x22 \n " " \r \\r \t \\t] $value]
        # Whatever control characters are left become \uXXXX escapes
        foreach ch [lsort -unique [regexp -all -inline {[\u0000-\u001f]} $value]] {
            set value [string map [list $ch [format {\u%04x} [scan $ch %c]]] $value]
        }
        return "\"$value\""
    }
    # Handle list of values
    set result {}
    foreach item $value {
        lappend result [::AO3::_toJson $item value]
    }
    return "\[[join $result {, }]\]"
}

# Here's some quick demo code

# Fetch one story and print its metadata, one "name : value" line per field.

set id 258626
set a [::AO3::New $id]

# These fields can be long, so only their first 51 characters are shown,
# followed by "..."
set truncated {summary relationships characters}

foreach field {
    id title author summary rating warnings category fandoms
    relationships characters additional_tags language published
    words comments chapters kudos bookmarks hits
} {
    set text [$a $field]
    if {$field in $truncated} {
        set text "[string range $text 0 50]..."
    }
    puts [format "%-16s: %s" $field $text]
}