-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathconstants.py
159 lines (151 loc) · 4.65 KB
/
constants.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import xooxle
import utils
# Xooxle search will work fine even if we don't retain any HTML tags, because it
# relies entirely on searching the text payloads of the HTML. However, we retain
# the subset of the classes that are needed for highlighting, in order to make
# the Xooxle search results pretty.
# Dialect codes. Every *_RETAIN_CLASSES set in this file includes all of them.
_DIALECTS = (
    # Codes listed without a source-specific note.
    {"S", "Sa", "Sf", "A", "sA", "B", "F", "Fb", "O"}
    # The following dialect is only found in Marcion.
    | {"NH"}
    # The following dialects are only found in TLA / KELLIA.
    | {"Ak", "M", "L", "P", "V", "W", "U", "K"}
)
# Classes retained in the Crum captures: every dialect code, plus the
# structural classes listed here.
_CRUM_RETAIN_CLASSES = {
    *_DIALECTS,
    "word",
    "type",
    "dialect",
    "dialect-comma",
    "dialect-parenthesis",
    "spelling",
    "spelling-comma",
}
# Classes retained in the KELLIA captures: every dialect code, plus the
# structural classes listed here.
_KELLIA_RETAIN_CLASSES = {
    *_DIALECTS,
    "word",
    "type",
    "dialect",
    "spelling",
    "lang",
    "geo",
    "gram_grp",
}
# Classes retained in the copticsite "front" capture: every dialect code, plus
# "word" and "spelling".
_COPTICSITE_RETAIN_CLASSES = {*_DIALECTS, "word", "spelling"}
# Xooxle index configured with the path site/data/xooxle/crum.json and three
# sub-indexes, one per directory under flashcards/data/output/web/.
_CRUM_INDEX = xooxle.index(
    "site/data/xooxle/crum.json",
    # --- Sub-index "crum" ---------------------------------------------------
    xooxle.subindex(
        input="flashcards/data/output/web/a_coptic_dictionary__all_dialects/",
        # Only files for which utils.stem(file).isdigit() holds are included.
        include=lambda file: utils.stem(file).isdigit(),
        extract=[
            xooxle.selector({"name": "title"}, force=False),
            # A run of class-based selectors that all share force=False.
            *(
                xooxle.selector({"class_": css_class}, force=False)
                for css_class in (
                    "header",
                    "dictionary",
                    "crum",
                    "crum-page",
                    "crum-page-external",
                    "dawoud",
                    "dawoud-page",
                    "dawoud-page-external",
                    "drv-key",
                )
            ),
            xooxle.selector({"id": "images"}, force=False),
            *(
                xooxle.selector({"class_": css_class}, force=False)
                for css_class in ("nag-hammadi", "sisters")
            ),
            # Unlike its neighbours, this selector is built without
            # force=False.
            xooxle.selector({"id": "pretty"}),
            xooxle.selector({"id": "categories"}, force=False),
        ],
        captures=[
            xooxle.capture(
                "marcion",
                xooxle.selector({"id": "marcion"}),
                # This is the list of classes needed for highlighting. If the
                # highlighting rules change, you might have to add new classes!
                retain_classes=_CRUM_RETAIN_CLASSES,
            ),
            xooxle.capture(
                "meaning",
                xooxle.selector({"id": "root-type-meaning"}, force=False),
                retain_classes=_CRUM_RETAIN_CLASSES,
            ),
            xooxle.capture(
                "appendix",
                xooxle.selector({"name": "body"}),
                retain_classes=_CRUM_RETAIN_CLASSES,
                unit_tags={"tr", "div", "hr"},
                # The default block elements, extended with "td".
                block_elements=xooxle.BLOCK_ELEMENTS_DEFAULT | {"td"},
            ),
        ],
        result_table_name="crum",
        href_fmt="{KEY}.html",
    ),
    # --- Sub-index "kellia" -------------------------------------------------
    xooxle.subindex(
        input="flashcards/data/output/web/kellia__comprehensive/",
        extract=[
            xooxle.selector({"name": "footer"}, force=False),
            *(
                xooxle.selector({"class_": css_class}, force=False)
                for css_class in ("bibl", "ref_xr", "ref")
            ),
        ],
        captures=[
            xooxle.capture(
                "orths",
                xooxle.selector({"id": "orths"}),
                retain_classes=_KELLIA_RETAIN_CLASSES,
            ),
            xooxle.capture(
                "senses",
                xooxle.selector({"id": "senses"}),
                retain_classes=_KELLIA_RETAIN_CLASSES,
            ),
            # No retain_classes argument is passed for this capture.
            xooxle.capture("text", xooxle.selector({"name": "body"})),
        ],
        result_table_name="kellia",
        href_fmt="https://coptic-dictionary.org/entry.cgi?tla={KEY}",
    ),
    # --- Sub-index "copticsite" ---------------------------------------------
    xooxle.subindex(
        input="flashcards/data/output/web/copticsite.com/",
        extract=[],
        captures=[
            xooxle.capture(
                "front",
                xooxle.selector({"id": "front"}),
                retain_classes=_COPTICSITE_RETAIN_CLASSES,
            ),
            # No retain_classes argument is passed for this capture.
            xooxle.capture("back", xooxle.selector({"id": "back"})),
        ],
        result_table_name="copticsite",
        # The href format for this sub-index is the empty string.
        href_fmt="",
    ),
)
# Every index defined by this module; currently only _CRUM_INDEX.
INDEXES = [_CRUM_INDEX]