-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgetArticleURLs.js
60 lines (54 loc) · 2.32 KB
/
getArticleURLs.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import makeRequest from './makeRequest.js';
import { randDelay } from './delay.js';
import { domain, language, smallDelay, mediumDelay } from './globals.js'
/**
 * Walks a Google Scholar profile's article listing page by page and collects,
 * for every listed article that passes the year filter, its URL and a hash of
 * its visible row data.
 *
 * @param {string} userId - Value placed in the `user` query parameter of the
 *   profile request.
 * @param {number[]} [years=[]] - Publication years to keep. An empty array
 *   keeps every article.
 * @returns {Promise<{articleURLs: string[], articleHashes: number[]}>} Two
 *   parallel arrays: entry `i` of `articleHashes` belongs to entry `i` of
 *   `articleURLs`.
 * @throws {Error} When a response does not contain the article table, plus
 *   whatever `makeRequest` or `randDelay` reject with (re-thrown after logging).
 */
export default async function getArticleURLs(userId, years = []) {
    try {
        // 100 is the maximum google scholar will send for each request.
        const pageSize = 100;
        const articleURLs = [];
        // Hashes are collected so article repetition between users can be checked.
        const articleHashes = [];
        let pageStart = 0;
        let articlesLeft;
        do {
            // Example of a profile listing URL:
            // https://scholar.google.com/citations?hl=es&user=DGmfF8QAAAAJ&view_op=list_works&sortby=pubdate
            const requestUrl = `${domain}/citations?user=${userId}&hl=${language}&oi=ao&cstart=${pageStart}&pagesize=${pageSize}`;
            pageStart += pageSize;
            // NOTE(review): makeRequest is assumed to resolve to a DOM-like document
            // exposing querySelectorAll — confirm against makeRequest.js.
            const document = await makeRequest(requestUrl);
            const tableBody = document.querySelectorAll('form table tbody')[0];
            if (!tableBody) {
                // Fail with a readable message instead of a TypeError on `undefined`.
                throw new Error(`Article table not found in the response for ${requestUrl}`);
            }
            const articleNodes = tableBody.querySelectorAll('tr');
            // The selector above finds a "There are no articles in this profile." row
            // when there are no articles left, so the list is normally never empty.
            // A row without a link therefore marks the end; an empty list is treated
            // the same way.
            articlesLeft = articleNodes.length > 0 && Boolean(articleNodes[0].querySelector('a'));
            if (articlesLeft) {
                for (const row of articleNodes) {
                    const titleCell = row.children[0];
                    const totalCitations = row.children[1].textContent;
                    // The raw year text feeds the hash; the parsed number feeds the filter.
                    const yearText = row.children[2].textContent;
                    const year = Number.parseInt(yearText, 10);
                    if (years.length === 0 || years.includes(year)) {
                        // NOTE(review): assumes the link's href is relative to `domain` — confirm.
                        articleURLs.push(`${domain}/${titleCell.querySelector('a').href}`);
                        const name = titleCell.children[0].textContent;
                        const authors = titleCell.children[1].textContent;
                        articleHashes.push(generateHash(name + authors + totalCitations + yearText));
                    }
                }
            }
            // Presumably a randomized pause between requests; it also runs after the last page.
            await randDelay(...smallDelay);
        } while (articlesLeft);
        return { articleURLs, articleHashes };
    } catch (error) {
        console.log('At getArticleURLs.');
        // Re-throw the original error so the caller's promise rejects with it unchanged.
        throw error;
    }
}
/**
 * Polynomial rolling hash over the UTF-16 code units of a string.
 *
 * Each code unit is multiplied by the current power of the base (61) and
 * reduced modulo 10^9 + 9 before being added to the total. The total itself
 * is not reduced, so the result can exceed the modulus.
 *
 * @param {string} string - Text to hash.
 * @returns {number} Sum of the weighted, individually reduced code units;
 *   0 for an empty string.
 */
function generateHash(string) {
    const base = 61;
    const modulus = 10 ** 9 + 9;
    let total = 0;
    let weight = 1;
    for (let index = 0; index < string.length; index += 1) {
        const codeUnit = string[index].charCodeAt(0);
        total += (codeUnit * weight) % modulus;
        weight = (weight * base) % modulus;
    }
    return total;
}