Skip to content

Commit

Permalink
rewrite to use css selectors
Browse files Browse the repository at this point in the history
  • Loading branch information
legowerewolf committed Apr 7, 2024
1 parent e29bf69 commit b4ca40a
Show file tree
Hide file tree
Showing 3 changed files with 136 additions and 75 deletions.
3 changes: 2 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
module github.com/legowerewolf/AO3fetch

go 1.20
go 1.21

require (
github.com/andybalholm/cascadia v1.3.2
github.com/cheggaaa/pb/v3 v3.1.5
github.com/deckarep/golang-set/v2 v2.6.0
github.com/gammazero/deque v0.2.1
Expand Down
36 changes: 36 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
github.com/VividCortex/ewma v1.2.0 h1:f58SaIzcDXrSy3kWaHNvuJgJ3Nmz59Zji6XoJR/q1ow=
github.com/VividCortex/ewma v1.2.0/go.mod h1:nz4BbCtbLyFDeC9SUHbtcT5644juEuWfUAUnGx7j5l4=
github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss=
github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU=
github.com/cheggaaa/pb/v3 v3.1.5 h1:QuuUzeM2WsAqG2gMqtzaWithDJv0i+i6UlnwSCI4QLk=
github.com/cheggaaa/pb/v3 v3.1.5/go.mod h1:CrxkeghYTXi1lQBEI7jSn+3svI3cuc19haAj6jM60XI=
github.com/deckarep/golang-set/v2 v2.6.0 h1:XfcQbWM1LlMB8BsJ8N9vW5ehnnPVIw0je80NsVHagjM=
Expand All @@ -18,9 +20,43 @@ github.com/mattn/go-runewidth v0.0.15/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
github.com/rivo/uniseg v0.4.3 h1:utMvzDsuh3suAEnhH0RdHmoPbU648o6CvXxTx4SBMOw=
github.com/rivo/uniseg v0.4.3/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c=
golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs=
golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
golang.org/x/net v0.24.0 h1:1PcaxkF854Fu3+lvBIx5SYn9wRlBzzcnHZSiaFFAb0w=
golang.org/x/net v0.24.0/go.mod h1:2Q7sJY5mzlzWjKtYUEXSlBWCdyaioyXzRB2RtU8KVE8=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.19.0 h1:q5f1RH2jigJ1MoAWp2KTp3gm5zAGFUTarQZ5U386+4o=
golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=
golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8=
golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc=
golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
172 changes: 98 additions & 74 deletions scraper.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"strings"
"time"

"github.com/andybalholm/cascadia"
"github.com/cheggaaa/pb/v3"
mapset "github.com/deckarep/golang-set/v2"
"github.com/gammazero/deque"
Expand All @@ -25,10 +26,17 @@ import (
// global variables
var (
	// matchers classify hrefs discovered while crawling
	isWorkMatcher, isSeriesMatcher, isSpecialMatcher *regexp.Regexp

	// paginationSelector matches the anchor links inside AO3's pagination
	// widget; compiled once at startup in main.
	paginationSelector cascadia.Selector

	// client is the AO3 HTTP client (authenticated when credentials are given).
	client *ao3client.Ao3Client
)

func main() {
	// compile regexes and selectors once, up front
	isWorkMatcher = regexp.MustCompile(`/works/\d+`)
	isSeriesMatcher = regexp.MustCompile(`/series/\d+`)
	isSpecialMatcher = regexp.MustCompile(`bookmarks|comments|collections|search|tags|users|transformative|chapters|kudos|navigate|share|view_full_work`)
	// BUG FIX: paginationSelector was declared *cascadia.Selector and then
	// assigned through the nil pointer (*paginationSelector = ...), which
	// panics at startup. Store the Selector value directly; a Selector
	// satisfies cascadia's Matcher interface, so QueryAll still accepts it.
	paginationSelector = cascadia.MustCompile(".pagination a")

// parse flags
var (
seedURLRaw, credentials, outputFile string
Expand Down Expand Up @@ -124,37 +132,14 @@ func main() {
}
defer resp.Body.Close()

highest := 0

tokenizer := html.NewTokenizer(resp.Body)
for tt := tokenizer.Next(); tt != html.ErrorToken; tt = tokenizer.Next() {
token := tokenizer.Token()

if !(token.Type == html.StartTagToken && token.Data == "a") {
continue
}

href, err := getHref(token)
if err != nil {
continue
}

uhref, err := url.Parse(href)
if err != nil {
continue
}

query := uhref.Query()
if query.Has("page") {
page, err := strconv.Atoi(query.Get("page"))
if err != nil {
continue
}
document, err := html.Parse(resp.Body)
if err != nil {
log.Fatal(err)
}

if page > highest {
highest = page
}
}
highest, err := getHighestPage(document)
if err != nil {
log.Fatal(err)
}

pages = highest - startPage + 1
Expand All @@ -165,18 +150,13 @@ func main() {

// parameters all check out, finish initializing

// compile regexes
isWorkMatcher = regexp.MustCompile(`/works/\d+`)
isSeriesMatcher = regexp.MustCompile(`/series/\d+`)
isSpecialMatcher = regexp.MustCompile(`bookmarks|comments|collections|search|tags|users|transformative|chapters|kudos|navigate|share|view_full_work`)

// make the coordination channels, queue, and sets
returnedWorks := make(chan string) // relays detected work URLs back to coordinator
returnedSeries := make(chan string) // ditto for series
finished := make(chan int) // tells coordinator when a crawl is finished
queue := deque.New[string](pages) // stores URLs to be crawled
workSet := mapset.NewSet[string]() // stores URLs of works that have been detected
seriesSet := mapset.NewSet[string]() // ditto for series
returnedWorks := make(chan string) // relays detected work URLs back to coordinator
returnedSeries := make(chan string) // ditto for series
finished := make(chan int) // tells coordinator when a crawl is finished
queue := deque.New[string](pages) // stores URLs to be crawled
queuedPagesSet := mapset.NewSet[string]() // stores URLs that have been added to the queue
workSet := mapset.NewSet[string]() // stores URLs of works that have been detected

// initialization done, start scraping

Expand All @@ -187,17 +167,14 @@ func main() {
fmt.Println("Delay: ", delay)

// populate queue
query := seedURL.Query()
for addlPage := 0; addlPage < pages; addlPage++ {
query.Set("page", strconv.Itoa(startPage+addlPage))
seedURL.RawQuery = query.Encode()

queue.PushBack(seedURL.String())
for _, page := range generatePageList(seedURL, startPage, startPage+pages-1) {
dedupedEnque(queue, queuedPagesSet, page)
}

// set up and start progress bar
bar := pb.New(pages)
bar.SetTemplateString(`{{counters .}} {{bar . " " ("█" | green) ("█" | green) ("█" | white) " "}} {{percent .}}`)
bar.SetTotal(int64(queuedPagesSet.Cardinality()))
if showProgress {
bar.Start()
}
Expand All @@ -215,13 +192,8 @@ func main() {
continue
}

if seriesSet.Contains(series) {
continue
}

seriesSet.Add(series)
queue.PushBack(series)
bar.SetTotal(int64(pages + seriesSet.Cardinality()))
dedupedEnque(queue, queuedPagesSet, series)
bar.SetTotal(int64(queuedPagesSet.Cardinality()))
case waitTime := <-finished: // exit coordinator loop when crawl is finished
if waitTime >= 0 { // waitTime >= 0 means we should try again later, so rotate the queue
queue.Rotate(1)
Expand All @@ -245,7 +217,7 @@ func main() {

bar.Finish()

log.Printf("Found %d works across %d pages. \n", workSet.Cardinality(), pages+seriesSet.Cardinality())
log.Printf("Found %d works across %d pages. \n", workSet.Cardinality(), pages+queuedPagesSet.Cardinality())
fmt.Println()

var workOutputTarget io.Writer
Expand Down Expand Up @@ -312,15 +284,16 @@ func crawl(crawlUrl string, returnedWorks, returnedSeries chan string, finished

crawledPageIsSeries := isSeriesMatcher.MatchString(crawlUrl)

tokenizer := html.NewTokenizer(resp.Body)
for tt := tokenizer.Next(); tt != html.ErrorToken; tt = tokenizer.Next() {
token := tokenizer.Token()
document, err := html.Parse(resp.Body)
if err != nil {
panic("failed to parse")
}

if !(token.Type == html.StartTagToken && token.Data == "a") {
continue
}
nodeList := cascadia.QueryAll(document, cascadia.MustCompile("a"))

href, err := getHref(token)
for _, node := range nodeList {

href, err := getAttr(node.Attr, "href")
if err != nil {
continue
}
Expand All @@ -336,27 +309,78 @@ func crawl(crawlUrl string, returnedWorks, returnedSeries chan string, finished
}

if crawledPageIsSeries {
for _, attr := range token.Attr {
if attr.Key != "rel" {
continue
}
highestPage, err := getHighestPage(document)
if err != nil {
continue
}

if attr.Val == "next" {
returnedSeries <- toFullURL(href)
break
}
u_crawledUrl, err := url.Parse(crawlUrl)
if err != nil {
continue
}

for _, page := range generatePageList(u_crawledUrl, 1, highestPage) {
returnedSeries <- page
}
}
}
}

func getHref(t html.Token) (string, error) {
for _, a := range t.Attr {
if a.Key == "href" {
func getAttr(attrList []html.Attribute, targetAttr string) (string, error) {
for _, a := range attrList {
if a.Key == targetAttr {
return a.Val, nil
}
}
return "", errors.New("no href attribute found")
return "", errors.New("target attribute not found")
}

// getHighestPage scans the pagination widget of an already-parsed document
// and reports the largest page number linked from it. Anchors whose href is
// missing, unparseable, or lacks a numeric "page" query parameter are
// skipped. When no usable pagination link exists it returns -1 with a nil
// error, so callers must treat -1 as "no pagination found".
func getHighestPage(document *html.Node) (int, error) {
	highest := -1

	for _, anchor := range cascadia.QueryAll(document, paginationSelector) {
		href, err := getAttr(anchor.Attr, "href")
		if err != nil {
			continue
		}

		parsed, err := url.Parse(href)
		if err != nil {
			continue
		}

		raw := parsed.Query().Get("page")
		if raw == "" {
			continue
		}

		pageNum, err := strconv.Atoi(raw)
		if err != nil {
			continue
		}

		if pageNum > highest {
			highest = pageNum
		}
	}

	return highest, nil
}

// dedupedEnque appends item to queue unless it was enqueued before.
// checkSet records every item ever offered; Set.Add reports true only for
// first-time members, so repeat offers are silently dropped.
func dedupedEnque[T comparable](queue *deque.Deque[T], checkSet mapset.Set[T], item T) {
	isNew := checkSet.Add(item)
	if !isNew {
		return
	}
	queue.PushBack(item)
}

// generatePageList builds the list of crawlable URLs for pages lowest
// through highest (inclusive) of the listing at seedURL, by setting the
// "page" query parameter on each copy. It returns nil when lowest > highest.
//
// BUG FIX: the previous version assigned seedURL.RawQuery directly, mutating
// the caller's URL as a side effect; we now work on a shallow copy so the
// seed URL is left untouched.
func generatePageList(seedURL *url.URL, lowest, highest int) (result []string) {
	query := seedURL.Query()
	for page := lowest; page <= highest; page++ {
		query.Set("page", strconv.Itoa(page))

		// shallow copy is safe: RawQuery is a plain string field
		pageURL := *seedURL
		pageURL.RawQuery = query.Encode()

		result = append(result, pageURL.String())
	}
	return
}

func toFullURL(url_ string) string {
Expand Down

0 comments on commit b4ca40a

Please sign in to comment.