-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathatf2tei.py
173 lines (156 loc) · 6.56 KB
/
atf2tei.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
from xml.sax.saxutils import escape
from pyoracc.atf.common.atffile import AtfFile
from pyoracc.model.line import Line
from pyoracc.model.oraccobject import OraccObject
from pyoracc.model.ruling import Ruling
from pyoracc.model.state import State
from pyoracc.model.translation import Translation
import tei
# When True, convert() prints the code and description of each parsed text.
verbose = False
def convert(atf_text):
    """
    Create a TEI representation of a string containing ATF.

    The input is parsed with pyoracc's AtfFile and converted in three steps:

    1. Every OraccObject child of the text becomes a tei.TextPart inside a
       tei.Edition. Lines are added with normalized transliteration;
       State and Ruling entries are added as tei.Note.
    2. Interlinear translations (line notes starting with 'tr.') collected
       during step 1 are emitted as one tei.Translation per language.
    3. Parallel Translation sections are emitted as a single tei.Translation
       marked 'eng', which is only attached if it received at least one line.

    Args:
        atf_text: The ATF source to convert, passed as-is to AtfFile.

    Returns:
        A tei.Document holding the header, the edition and any translations.
    """
    # Parse the ATF input string.
    atf = AtfFile(atf_text, 'cdli', False)
    if verbose:
        print("Parsed {} -- {}".format(atf.text.code, atf.text.description))

    # Construct a TEI Document to hold the converted text.
    doc = tei.Document()
    doc.language = atf.text.language
    doc.header = tei.Header()
    doc.header.title = atf.text.description
    doc.header.cdli_code = atf.text.code

    # Traverse the parse tree, recording lines under labels.
    translations = {}
    objects = [item for item in atf.text.children
               if isinstance(item, OraccObject)]
    edition = tei.Edition()
    doc.parts.append(edition)
    for item in objects:
        part = tei.TextPart(item.objecttype)
        edition.append(part)
        for section in item.children:
            if isinstance(section, OraccObject):
                # Not every section carries a name; fall back to its type.
                try:
                    name = section.name
                except AttributeError:
                    name = section.objecttype
                div = tei.TextPart(name)
                part.append(div)
            elif isinstance(section, Translation):
                # Handle in another pass.
                continue
            else:
                print('Skipping unknown section type',
                      type(section).__name__)
                continue
            for obj in section.children:
                if isinstance(obj, Line):
                    text = normalize_transliteration(obj.words)
                    line = tei.Line(obj.label, text)
                    div.append(line)
                    # Older pyoracc parses interlinear translations
                    # as notes. Remember them for serialization below.
                    for note in obj.notes:
                        if note.content.startswith('tr.'):
                            # Note content has the form 'tr.<lang>: <text>'.
                            lang, text = note.content.split(':', maxsplit=1)
                            _, lang = lang.split('.')
                            # tr.ts is used for normalization, so mark
                            # this with the primary object's language.
                            if lang == 'ts':
                                lang = atf.text.language
                            tr_line = Line(obj.label)
                            tr_line.words = text.strip().split()
                            translations.setdefault(lang, []).append(tr_line)
                elif isinstance(obj, (State, Ruling)):
                    text = str(obj).strip()
                    # Strip the initial '$' off the ATF representation.
                    text = text[1:].strip()
                    div.append(tei.Note(text))
                else:
                    print('Skipping unknown section child type',
                          type(obj).__name__)
                    continue

    # Add accumulated interlinear translations to the document.
    for lang, tr_lines in translations.items():
        translation = tei.Translation()
        translation.language = lang
        doc.parts.append(translation)
        for tr_line in tr_lines:
            text = ' '.join(tr_line.words)
            line = tei.Line(tr_line.label, text)
            translation.append(line)

    # Traverse the tree again, recording any parallel translation sections.
    # pyoracc only supports these for English.
    translation = tei.Translation()
    translation.language = 'eng'
    translation_empty = True
    for item in objects:
        part = tei.TextPart(item.objecttype)
        translation.append(part)
        for section in item.children:
            # Skip anything which is not a translation for this pass.
            if not isinstance(section, Translation):
                continue
            for surface in section.children:
                if isinstance(surface, OraccObject):
                    div = tei.TextPart(surface.objecttype)
                    part.append(div)
                    for obj in surface.children:
                        if isinstance(obj, Line):
                            text = ' '.join(obj.words)
                            line = tei.Line(obj.label, text)
                            div.append(line)
                            translation_empty = False
                        else:
                            print('Skipping unknown section child type',
                                  type(obj).__name__)
                            continue
    # Only attach the parallel translation if it actually contains lines.
    if not translation_empty:
        doc.parts.append(translation)

    return doc
# ATF ASCII digraphs and their Unicode replacements. They are applied
# one after another in exactly this order.
_ATF_DIGRAPHS = (
    ('sz', 'š'),   # \u0161
    ('SZ', 'Š'),   # \u0160
    ('s,', 'ṣ'),   # \u1E63
    ('S,', 'Ṣ'),   # \u1E62
    ('t,', 'ṭ'),   # \u1E6D
    ('T,', 'Ṭ'),   # \u1E6C
    ('s\'', 'ś'),  # \u015B
    ('S\'', 'Ś'),  # \u015A
    ('h,', 'ḫ'),   # \u1E2B
    ('H,', 'Ḫ'),   # \u1E2A
    ('j', 'ŋ'),    # \u014B
    ('J', 'Ŋ'),    # \u014A
)


def normalize_transliteration(words):
    """Convert a sequence of words from atf to standard formatting.

    For each word, in order: ASCII digraphs are replaced by their Unicode
    characters, every hyphen-separated sign ending in '#' (the damage mark)
    is wrapped in half-brackets with the '#' removed, and the result is
    XML-escaped.

    Args:
        words: An iterable of ATF word strings.

    Returns:
        The converted words joined by single spaces; an empty string when
        ``words`` is empty.
    """
    # See http://oracc.org/doc/help/editinginatf/primer/inlinetutorial/
    result = []
    for word in words:
        # Convert digraphs to corresponding unicode characters. The patterns
        # are fixed strings, so plain str.replace is sufficient.
        for ascii_form, unicode_form in _ATF_DIGRAPHS:
            word = word.replace(ascii_form, unicode_form)
        # Convert damage marks to half-brackets.
        marked = [
            '⸢' + sign[:-1] + '⸣' if sign.endswith('#')
            else sign
            for sign in word.split('-')
        ]
        word = '-'.join(marked)
        # XML-escape the result.
        word = escape(word)
        # Convert markup to tei elements.
        # TODO: <c type="determinative">
        # TODO: <c type="sign" subtype="logo">
        result.append(word)
    return ' '.join(result)
if __name__ == '__main__':
    import io
    import sys

    # Treat every command-line argument as the path of a UTF-8 encoded
    # ATF file; convert each one in turn and print the resulting document.
    for filename in sys.argv[1:]:
        with io.open(filename, encoding='utf-8') as f:
            atf_source = f.read()
            doc = convert(atf_source)
        print(doc)