-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.js
108 lines (94 loc) · 2.9 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
const fetch = require('node-fetch');
const fs = require('fs');
const separator = '~';
const createRowObject = require('./parser');
// Startup banner. The code below uses generators and async functions.
console.log('Node > v8.0.0 required');
console.log('------------------------');
// First node id to request; URLGenerator walks ids from startIndex up to
// (but not including) startIndex + maxResults.
let startIndex = 91800;
// Number of consecutive node ids visited in one run.
const maxResults = 9000;
// Output CSV file name, derived from the timestamp at startup; writeRow
// appends every formatted row to this file.
const currentFile = new Date().toJSON()+'.csv';
/**
 * Builds the page address for a given node id.
 *
 * @param {number|string} index - Node id appended to the base address.
 * @returns {string} Absolute URL of the node page.
 */
const buildURL = (index) => {
  const nodeBase = 'http://directorio.directoriolegislativo.org/node/';
  return nodeBase + index;
};
/**
 * Lazily yields one `{ url, index }` pair per node id, starting at
 * `startIndex` and stopping before `startIndex + maxResults`.
 *
 * @yields {{url: string, index: number}} The page URL and the node id it was built from.
 */
const URLGenerator = function* () {
  // `let` keeps the counter local; without a declaration it leaked as an
  // implicit global (and throws a ReferenceError in strict mode).
  for (let i = startIndex; i < startIndex + maxResults; i++) {
    yield { url: buildURL(i), index: i };
  }
};
/**
 * Downloads a page and resolves with its body as text.
 *
 * @param {string} url - Address to request.
 * @returns {Promise<string>} The response body.
 */
const fetchHtml = async (url) => {
  const response = await fetch(url);
  return response.text();
};
/**
 * Reads a previously cached page from `./cache/<index>`.
 *
 * Never rejects: any read error (for example a missing file) is reported as a
 * cache miss.
 *
 * @param {number|string} index - Node id used as the cache file name.
 * @returns {Promise<Buffer|false>} The raw file contents (no encoding is
 *   requested, so a Buffer), or `false` when the file could not be read.
 */
const readCache = (index) => {
  // The parameter was previously named `url` while the body read a free
  // variable `index`; the file name now comes from the argument itself.
  return new Promise((resolve) => {
    fs.readFile(`./cache/${index}`, (err, html) => {
      if (err) {
        resolve(false);
        return; // do not fall through to a second resolve
      }
      resolve(html);
    });
  });
};
/**
 * Stores a page under `./cache/<index>` (opened in append mode).
 *
 * @param {number|string} index - Node id used as the cache file name.
 * @param {string|Buffer} html - Page contents to store.
 * @returns {Promise<string|Buffer>} Resolves with `html` once written; rejects
 *   with the file-system error otherwise.
 */
const writeCache = (index, html) => new Promise((resolve, reject) => {
  const cachePath = `./cache/${index}`;
  const onWritten = (err) => {
    if (err) {
      reject(err);
      return;
    }
    resolve(html);
  };
  fs.writeFile(cachePath, html, { flag: 'a' }, onWritten);
});
/**
 * Returns the page for a node, preferring the local cache and falling back
 * to the network (in which case the result is cached before returning).
 *
 * @param {number|string} index - Node id; used as the cache key and in log lines.
 * @param {string} url - Address fetched on a cache miss.
 * @returns {Promise<*>} Whatever the cache or the network produced for the page.
 */
const getHTML = async (index, url) => {
  const fromCache = await readCache(index);
  if (!fromCache) {
    const fromNetwork = await fetchHtml(url);
    console.log(`${index} : Resolved from Network`);
    await writeCache(index, fromNetwork);
    return fromNetwork;
  }
  console.log(`${index} : Resolved from Cache`);
  return fromCache;
};
/**
 * Pads every array-valued property of `rowObject` with '-' entries until it
 * reaches the length recorded for that key in `keysMaxLengths`.
 * The row is modified in place; keys without a recorded length are untouched.
 *
 * @param {Object} rowObject - Row whose array properties are padded.
 * @param {Object} keysMaxLengths - Target length per key.
 * @returns {Object} The same `rowObject` instance.
 */
const arrayPad = (rowObject, keysMaxLengths) => {
  for (const key of Object.keys(rowObject)) {
    const cell = rowObject[key];
    if (!Array.isArray(cell)) {
      continue;
    }
    while (cell.length < keysMaxLengths[key]) {
      cell.push('-');
    }
  }
  return rowObject;
};
/**
 * Serialises one row into a single output line.
 *
 * Property values are emitted in key order, joined by the module-level
 * `separator`; array values contribute one field per element. Every empty
 * field is written as '-', and the line ends with a trailing separator
 * followed by a newline.
 *
 * The previous regex-based replacement of doubled separators skipped
 * overlapping matches, so runs of two or more empty fields (and an empty
 * first field) were left blank. Fields are now inspected one by one.
 *
 * @param {Object} rowObject - Row to serialise.
 * @returns {string} The formatted line, including the final '\n'.
 */
const formatRow = (rowObject) => {
  const cells = Object.keys(rowObject).map((key) => {
    const value = rowObject[key];
    return Array.isArray(value) ? value.join(separator) : String(value);
  });
  // A row with no properties stays a bare newline.
  if (cells.length === 0) {
    return '\n';
  }
  // Re-split on the separator so array elements are treated as individual
  // fields when looking for empty ones.
  const line = cells
    .join(separator)
    .split(separator)
    .map((field) => (field === '' ? '-' : field))
    .join(separator);
  return line + separator + '\n';
};
/**
 * Appends one formatted line to the output file named by `currentFile`.
 *
 * @param {string} row - Line to append.
 * @returns {Promise<string>} Resolves with `row` once written; rejects with
 *   the file-system error otherwise.
 */
const writeRow = (row) => new Promise((resolve, reject) => {
  const onAppended = (err) => {
    if (err) {
      reject(err);
      return;
    }
    resolve(row);
  };
  fs.writeFile(currentFile, row, { flag: 'a' }, onAppended);
});
/**
 * Pads, formats and writes every row, one after another and in order.
 *
 * @param {Iterable<Object>} rowObjects - Rows to write.
 * @param {Object} keysMaxLengths - Target array length per key, forwarded to arrayPad.
 * @returns {Promise<void>} Settles after the last row has been written.
 */
const writeOutput = async (rowObjects, keysMaxLengths) => {
  for (const rowObject of rowObjects) {
    const padded = arrayPad(rowObject, keysMaxLengths);
    const line = formatRow(padded);
    await writeRow(line);
  }
};
/**
 * Entry point: visits every URL produced by URLGenerator in sequence, parses
 * each page with createRowObject, tracks the longest array seen per key, and
 * finally hands all rows to writeOutput.
 *
 * Pages for which createRowObject returns a falsy value are skipped.
 *
 * @returns {Promise<void>} Settles once all rows have been written.
 */
async function run(){
  const rowObjects = [];
  // Prototype-less dictionary (key -> longest array length). The previous
  // Array inherited names such as `constructor`, which turned the maximum
  // for a same-named key into NaN.
  const keysMaxLengths = Object.create(null);
  // NOTE(review): `url` and `index` are assigned without a declaration, so
  // they become implicit globals in sloppy mode. They are left undeclared
  // here because other code in this file may read the leaked `index` —
  // confirm that nothing does before adding `const`.
  for ({url, index} of URLGenerator()){
    const rowObject = createRowObject(await getHTML(index, url), url);
    if (!rowObject) {
      continue;
    }
    for (const key of Object.keys(rowObject)) {
      const value = rowObject[key];
      if (Array.isArray(value)) {
        keysMaxLengths[key] = Math.max(value.length, keysMaxLengths[key] || 0);
      }
    }
    rowObjects.push(rowObject);
  }
  await writeOutput(rowObjects, keysMaxLengths);
}
// Start the scrape. Report any failure and exit with a non-zero status
// instead of leaving the rejected promise unhandled.
run().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});