Commit a1b80b8
added data scraper code
1 parent 4e4be13
11 files changed, +930 -0 lines changed

bin/update_data.sh

+32
#!/usr/bin/env bash

# greadlink is GNU readlink from Homebrew coreutils (macOS);
# on Linux, plain `readlink -f` does the same thing.
SCRIPT_DIR=$(dirname "$(greadlink -f "${BASH_SOURCE[0]}")")
ROOT_DIR=$(dirname "$SCRIPT_DIR")
SRC_DIR="${ROOT_DIR}/src"
DATA_DIR="${SRC_DIR}/data"

function echo_and_eval() {
    echo "$@"
    eval "$@"
}

# get google spreadsheet csv files
echo_and_eval "(cd ${SRC_DIR} && ts-node get_csv_files.ts)"

# normalize csv files
for x in "${DATA_DIR}"/*.csv; do
    echo_and_eval "python3 ${ROOT_DIR}/py/normalize_csv_files.py ${x} \
        > ${x/.csv/.yomi.csv}"
    echo_and_eval "mv ${x/.csv/.yomi.csv} ${x}"
    #echo_and_eval "open ${x}"
done

# aggregate area hotlines
HOTLINES="${DATA_DIR}/hokkaido.csv ${DATA_DIR}/tohoku.csv \
    ${DATA_DIR}/kanto.csv ${DATA_DIR}/chubu.csv ${DATA_DIR}/kansai.csv \
    ${DATA_DIR}/chugoku.csv ${DATA_DIR}/shikoku.csv ${DATA_DIR}/kyushu_okinawa.csv"
echo_and_eval "python3 ${ROOT_DIR}/py/concat_csv_files.py ${HOTLINES} \
    > ${DATA_DIR}/all.csv"

# convert csv files to json
echo_and_eval "(cd ${SRC_DIR} && ts-node make_json_files.ts)"

py/concat_csv_files.py

+19
import csv
import sys


def main():
    # Concatenate CSV files that share a header, writing to stdout;
    # only the first file's header row is emitted.
    csv_files = sys.argv[1:]
    for i, csv_file in enumerate(csv_files):
        with open(csv_file) as f:
            reader = csv.DictReader(f)
            header = reader.fieldnames
            writer = csv.DictWriter(sys.stdout, fieldnames=header)
            if i == 0:
                writer.writeheader()
            for row in reader:
                writer.writerow(row)


if __name__ == '__main__':
    main()
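One assumption worth flagging here: every input file must share the first file's columns, since csv.DictWriter raises ValueError on unexpected keys. A defensive variant might look like this (a sketch, not in the commit):

import csv
import sys

def concat(paths, out=sys.stdout):
    writer = None
    for path in paths:
        with open(path, newline='') as f:
            reader = csv.DictReader(f)
            if writer is None:
                # Take the header from the first file; ignore extra columns
                # in later files and blank-fill missing ones.
                writer = csv.DictWriter(out, fieldnames=reader.fieldnames,
                                        extrasaction='ignore', restval='')
                writer.writeheader()
            for row in reader:
                writer.writerow(row)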

py/list_prefs.py

+17
import json
import sys

# The 47 prefectures in standard order; not yet referenced in this script.
PREF = [ "北海道", "青森県", "岩手県", "宮城県", "秋田県", "山形県", "福島県", "茨城県", "栃木県", "群馬県", "埼玉県", "千葉県", "東京都", "神奈川県", "新潟県", "富山県", "石川県", "福井県", "山梨県", "長野県", "岐阜県", "静岡県", "愛知県", "三重県", "滋賀県", "京都府", "大阪府", "兵庫県", "奈良県", "和歌山県", "鳥取県", "島根県", "岡山県", "広島県", "山口県", "徳島県", "香川県", "愛媛県", "高知県", "福岡県", "佐賀県", "長崎県", "熊本県", "大分県", "宮崎県", "鹿児島県", "沖縄県" ]


def main():
    json_file = sys.argv[1]
    with open(json_file) as f:
        json_obj = json.load(f)
    areas = json_obj['area']
    for a in areas:
        print('\t'.join([a['name_jp'], a['url']]))


if __name__ == '__main__':
    main()
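Since PREF is defined but unused, it is presumably intended for ordering or validating scraped prefecture names. One plausible use (an assumption, not from the commit):

def pref_sort_key(name: str) -> int:
    # Position in the standard prefecture ordering; unknown names sort last.
    return PREF.index(name) if name in PREF else len(PREF)

# e.g. sorted(areas, key=lambda a: pref_sort_key(a['name_jp']))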

py/normalize_csv_files.py

+93
import csv
import re
import sys

import MeCab
import jaconv
import romkan

tagger = MeCab.Tagger('-Oyomi')
tokenizer = MeCab.Tagger('-Owakati')


def get_yomi_str(line: str) -> str:
    # Tokenize, take the katakana reading, romanize, and capitalize each word.
    tokenized = tokenizer.parse(line)
    yomi = tagger.parse(tokenized).strip()
    romaji = romkan.to_hepburn(yomi)
    toks = [x.capitalize()
            for x in romaji.split(' ')]
    return ' '.join(toks)


def get_yomi(row):
    # Keep a hand-entered reading when present; otherwise generate one
    # from the Japanese center name.
    try:
        center = row['センター名'].strip()
    except KeyError:
        return ''
    yomi = (row.get('Center Name') or '').strip()
    return yomi if yomi else get_yomi_str(center)


def update_row(row):
    # Fill in 'Center Name' whenever a Japanese name exists, even for rows
    # read from files that lacked the column (main() extends the header).
    updated = dict(row)
    if 'センター名' in row:
        updated['Center Name'] = get_yomi(row)
    return updated


dash_re = re.compile(r'(?<![ぁ-んァ-ン])ー')
ws_re = re.compile(r' +')


def normalize_line(line):
    # Full-width digits/ASCII become half-width (kana is left alone), then
    # a prolonged-sound mark not following kana becomes an ASCII hyphen and
    # runs of spaces collapse.
    half = jaconv.z2h(
        jaconv.normalize(line),
        kana=False, digit=True, ascii=True
    )
    stripped = half.strip()
    dashed = dash_re.sub('-', stripped)
    return ws_re.sub(' ', dashed)


def normalize_phone(line):
    return (
        normalize_line(line)
        .replace('(', '-')
        .replace(')', '-')
    )


phone_re = re.compile(r'(phone|電話)')


def normalize(line, field=''):
    if phone_re.match(field.lower()):
        return normalize_phone(line)
    else:
        return normalize_line(line)


def normalize_row(row):
    # Pass the field name through so phone columns get phone formatting.
    return {k: normalize(v, k)
            for k, v in row.items()}


def main():
    csv_files = sys.argv[1:]
    for i, csv_file in enumerate(csv_files):
        with open(csv_file) as f:
            reader = csv.DictReader(f)
            if ('センター名' in reader.fieldnames and
                    'Center Name' not in reader.fieldnames):
                header = reader.fieldnames + ['Center Name']
            else:
                header = reader.fieldnames
            writer = csv.DictWriter(sys.stdout, fieldnames=header)
            if i == 0:
                writer.writeheader()
            for row in reader:
                updated = update_row(row)
                normalized = normalize_row(updated)
                writer.writerow(normalized)


if __name__ == '__main__':
    main()
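To make the normalization concrete, a few illustrative rewrites (example values, not from the commit; the generated yomi depends on the installed MeCab dictionary):

# Full-width digits and parentheses in phone fields become half-width
# with dashes:
#   normalize('０３（１２３４）５６７８', field='電話番号')  ->  '03-1234-5678'
# A prolonged-sound mark that does not follow kana becomes an ASCII hyphen:
#   normalize_line('03ー1234')  ->  '03-1234'
# get_yomi_str tokenizes with -Owakati, reads with -Oyomi, romanizes with
# romkan, and capitalizes each token; exact readings vary by dictionary.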

py/requirements.txt

+7
MeCab
bs4
genmonads
html_table_extractor
jaconv
requests
romkan

py/scrape_hokenjo.py

+77
import csv
import sys
from typing import List, Tuple

import requests
from bs4 import BeautifulSoup
from genmonads.iterator import miter
from genmonads.mtry import mtry
from genmonads.option import option
from genmonads.syntax import mfor
from html_table_extractor.extractor import Extractor


def table2lists(table):
    extractor = Extractor(table)
    extractor.parse()
    return extractor.return_list()


def get_all_tables(url):
    html = requests.get(url).content
    soup = BeautifulSoup(html, 'html.parser')
    return soup.find_all('table')


def scrape_table_text(url):
    for table in get_all_tables(url):
        for row in table2lists(table):
            yield row


def scrape_prefecture_url(li):
    prefecture = li.string
    # Return the link target, not the <a> tag itself.
    a = li.find('a')
    url = a['href'] if a is not None and a.has_attr('href') else ''
    return prefecture, url


# noinspection PyUnresolvedReferences
def scrape_hokenjo_table(url: str) -> List[Tuple[str, str, str]]:
    # Monadic for-comprehension: any row missing a <th>, <td>, <a>, or
    # href is skipped rather than raising.
    return mfor(
        (region, prefecture, url)
        for table in miter(get_all_tables(url))
        for tr in miter(table.find_all('tr'))
        for region in mtry(lambda: tr.find('th').string).to_miter()
        for td in option(tr.find('td')).to_miter()
        for li in miter(td.find_all('li'))
        for a in option(li.find('a')).to_miter()
        for prefecture in mtry(lambda: a.string).to_miter()
        for url in mtry(lambda: a['href']).to_miter()
    ).to_list()


def clean_url(url):
    # Drop any #fragment so the same page is not fetched twice.
    return url.split('#')[0]


def scrape_hokenjo_urls(url: str):
    # dict keys deduplicate while preserving insertion order
    return {clean_url(u): True
            for _, _, u in scrape_hokenjo_table(url)}.keys()


def is_header(row):
    return row[0] in ['設置主体:', '都道府県名']


def main():
    urls = sys.argv[1:]
    writer = csv.writer(sys.stdout, csv.excel_tab)
    for u in urls:
        for uu in scrape_hokenjo_urls(u):
            for row in scrape_table_text(uu):
                if not is_header(row):
                    writer.writerow(row)


if __name__ == '__main__':
    main()
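For readers unfamiliar with genmonads, the mfor comprehension above is roughly equivalent to the plain loops below (a sketch without the monad plumbing; like the original, a <th> or <a> whose .string is None passes through as None rather than being skipped):

def scrape_hokenjo_table_plain(url):
    results = []
    for table in get_all_tables(url):
        for tr in table.find_all('tr'):
            th, td = tr.find('th'), tr.find('td')
            if th is None or td is None:
                continue
            region = th.string  # may be None if the <th> holds nested tags
            for li in td.find_all('li'):
                a = li.find('a')
                if a is None or not a.has_attr('href'):
                    continue
                results.append((region, a.string, a['href']))
    return results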

src/get_csv_files.ts

+134
import * as CF from "cross-fetch"
import * as E from "fp-ts/lib/Either"
import * as TE from "fp-ts/lib/TaskEither"
import { promises as fs } from "fs"
import { array } from "fp-ts/lib/Array"
import { pipe } from "fp-ts/lib/pipeable"
import { TaskEither, taskEither, tryCatch } from "fp-ts/lib/TaskEither"
import * as R from "rambda"

import { run } from "./under_util"

// URL of one sheet of a spreadsheet "published to the web" as CSV
const makeSpreadsheetUrl = (key: string, gid: string): string =>
  `https://docs.google.com/spreadsheets/d/e/${key}/pub?` +
  `gid=${gid}&single=true&output=csv`

const fetchGoogleSpreadSheetCsv = (
  key: string,
  gid: string
): TaskEither<Error, string> => {
  const url = makeSpreadsheetUrl(key, gid)
  return tryCatch(
    () => CF.fetch(url).then((x) => x.text()),
    E.toError
  )
}

const makeCsvFile = (
  fn: string,
  key: string,
  gid: string
): TaskEither<Error, string> => {
  const csv = `${fn}.csv`
  const writeM = (file: string, data: string): TaskEither<Error, void> =>
    tryCatch(() => fs.writeFile(file, data), E.toError)

  return pipe(
    fetchGoogleSpreadSheetCsv(key, gid),
    TE.chain(R.partial(writeM, csv)),
    TE.map(_ => csv)
  )
}

const makeCsvFile_ = (
  [fn, key, gid]: [string, string, string]
): TaskEither<Error, string> =>
  makeCsvFile(fn, key, gid)

// eslint-disable-next-line
// noinspection JSUnusedLocalSymbols
const cleanSheetName = (sheet: string): string =>
  sheet
    .trim()
    .toLowerCase()
    .replace(/foreign-language/, '')  // lowercased above, so match lowercase
    .replace(/[/ -]/g, '_')

// noinspection SpellCheckingInspection
const key: string =
  '2PACX-1vRd6DGCaxlPwhfgpH_b9jhBHxJ-k-iVXmtOYDVq2w_' +
  'qJutKk8nKN4iToAqPjtUw7kzh7cZSJuRV8Yra'

// noinspection SpellCheckingInspection
export const gids: Record<string, string> = {
  'hotlines': '2133443778',
  //'contents': '151378524',
  'hokkaido': '2127938906',
  'tohoku': '1010845727',
  'kanto': '1686906593',
  'chubu': '1422675244',
  'kansai': '843788725',
  'chugoku': '730111385',
  'shikoku': '623669625',
  'kyushu_okinawa': '1647012312',
  'menu': '151378524',
  'translations': '340977019',
  'webpages': '1947687836',
}

// noinspection JSUnusedGlobalSymbols
export const areas: Array<string> =
  ['hotlines', 'hokkaido', 'tohoku', 'kanto', 'chubu', 'kansai', 'chugoku',
    'shikoku', 'kyushu_okinawa', 'all']

// noinspection JSUnusedGlobalSymbols
export const sheets: Array<string> =
  Object.keys(gids)

export const makeFn = (sheet: string): string =>
  `data/${sheet}`

// noinspection JSUnusedGlobalSymbols
export const getSheet = (fn: string): string =>
  fn.split('/')[1]  // fn has the shape "data/<sheet>"

// noinspection JSUnusedGlobalSymbols
export const fns: Array<string> =
  sheets.map(makeFn)

const fnKeyGids: Array<[string, string, string]> =
  Object
    .entries(gids)
    .map(([sheet, gid]) =>
      [makeFn(sheet), key, gid]
    )

const logError = (e: Error): void =>
  console.error(e)

// noinspection JSUnusedLocalSymbols
// eslint-disable-next-line
const logFile = (file: string): void =>
  console.log(`wrote file "${file}"`)

const logFiles = (files: Array<string>): void => {
  const msg = files.map(file => `wrote file "${file}"`).join('\n')
  console.log(msg)
}

export const makeAllCsvFiles = (): Promise<void> => {
  // traverse runs the fetch-and-write tasks, failing fast on the first Error
  const makeCsvFiles: TaskEither<Error, Array<string>> =
    array.traverse(taskEither)(fnKeyGids, makeCsvFile_)

  return run(makeCsvFiles)
    .then(E.fold(logError, logFiles))
    .catch(logError)
}

function main(): void {
  makeAllCsvFiles().then()
}

if (require.main === module) {
  main()
}
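The TypeScript above pulls each sheet through Google's published-CSV endpoint, addressed by the document key plus a per-sheet gid. To spot-check a single sheet outside the pipeline, the same URL can be fetched with requests (a sketch; requests is already in py/requirements.txt):

import requests

def fetch_sheet_csv(key: str, gid: str) -> str:
    # Mirrors makeSpreadsheetUrl in src/get_csv_files.ts
    url = (f'https://docs.google.com/spreadsheets/d/e/{key}/pub'
           f'?gid={gid}&single=true&output=csv')
    resp = requests.get(url)
    resp.raise_for_status()
    return resp.text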

src/make_hotline_data.ts

+11
import { makeAllCsvFiles } from "./get_csv_files"
import { writeCsvFileToJsonFiles } from "./make_json_files"

function main() {
  makeAllCsvFiles()
    .then(_ => writeCsvFileToJsonFiles())
}

if (require.main === module) {
  main()
}
