-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmeaning.py
211 lines (179 loc) · 7.33 KB
/
meaning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
"""This module preprocesses the data that can be extracted from the Duden online dictionary.
Functions:
delete_umlauts(string) -> string
remove_brackets_hyphens(string) -> string
remove_nonessential_words_symbols(string) -> string
remove_abbreviations(string) -> string
remove_dot(string) -> string
clean_sentence(string) -> string
search_duden_meaning(string) -> string or None
remove_one_word_result_multiples(pandas.DataFrame) -> pandas.DataFrame
split_semicolon(pandas.DataFrame) -> pandas.DataFrame
split_semicolon_including_raw(pandas.DataFrame) -> pandas.DataFrame
"""
import re
import duden
def delete_umlauts(text: str):
    """Transliterates German umlauts and the sharp s into plain letter pairs.

    Lower- and upper-case umlauts are both converted ("ä" -> "ae", "Ä" -> "Ae"),
    so capitalised words are handled as well. "ß" becomes "sz".

    Args:
        text: A string.

    Returns:
        The string with ä/ö/ü/Ä/Ö/Ü/ß replaced; every other character is unchanged.
    """
    umlauts = str.maketrans({
        'ä': 'ae', 'ö': 'oe', 'ü': 'ue',
        'Ä': 'Ae', 'Ö': 'Oe', 'Ü': 'Ue',
        'ß': 'sz',
    })
    return text.translate(umlauts)
def remove_brackets_hyphens(text: str):
    """Removes parenthesised remarks, square bracket characters and dash-enclosed insertions.

    Three kinds of text are deleted:
      * round brackets with their content, together with one adjacent space
        (either the space before or the space after the brackets),
      * the square bracket characters themselves (the text inside them is kept),
      * an insertion enclosed by two en dashes, together with the space that follows it.

    Args:
        text: A string.

    Returns:
        The string without the parts listed above.
    """
    # Raw string: "\(" and "\[" are regex escapes, not Python string escapes.
    return re.sub(r" \(.*?\)|\(.*?\) |\[|\]|–[^–]+– ", "", text)
def remove_nonessential_words_symbols(sentence: str):
    """Removes symbols and phrases that are not part of the definition of a word.

    The steps run in this order:
      1. cut everything from a "💡" note (preceded by a blank line) onwards,
      2. if the text contains "Kurzform für", drop the first line and the character after its newline,
      3. cut everything from a line starting with "Kurzform" onwards,
      4. cut everything from a line starting with "Abkürzung" onwards,
      5. if the text contains "Kurzwort für", drop the first line and the character after its newline,
      6. cut everything from a line starting with "Kurzwort" onwards,
      7. cut everything from a "Herkunft" line onwards (with or without a preceding blank line).

    A step whose marker does not occur leaves the text untouched.

    Args:
        sentence: A string.

    Returns:
        The string without these symbols and phrases.
    """
    sentence = _cut_from_marker(sentence, '\n\n💡')
    if 'Kurzform für' in sentence:
        sentence = _drop_first_line(sentence)
    sentence = _cut_from_marker(sentence, '\nKurzform')
    sentence = _cut_from_marker(sentence, '\nAbkürzung')
    if 'Kurzwort für' in sentence:
        sentence = _drop_first_line(sentence)
    sentence = _cut_from_marker(sentence, '\nKurzwort')
    sentence = _cut_from_marker(sentence, '\n\nHerkunft')
    sentence = _cut_from_marker(sentence, '\nHerkunft')
    return sentence


def _cut_from_marker(text: str, marker: str):
    """Returns text up to the first occurrence of marker, or text unchanged if marker is absent."""
    position = text.find(marker)
    # find() returns -1 when the marker is missing; slicing with -1 would drop the last character.
    if position == -1:
        return text
    return text[:position]


def _drop_first_line(text: str):
    """Returns text after the first newline plus one further character, or text unchanged if there is no newline."""
    newline = text.find('\n')
    if newline == -1:
        return text
    # +2 skips the newline itself and the character that follows it.
    return text[newline + 2:]
def remove_abbreviations(sentence: str):
    """Strips the filler abbreviations "o. Ä.", "o. ä.", "u. a." and "u. Ä." from a string.

    Each abbreviation is matched together with the single space in front of it.

    Args:
        sentence: A string.

    Returns:
        The string with every occurrence of these abbreviations removed.
    """
    cleaned = sentence
    for filler in (" o. Ä.", " o. ä.", " u. a.", " u. Ä."):
        cleaned = cleaned.replace(filler, "")
    return cleaned
def remove_dot(sentence: str):
    """Removes all dots from a string unless it contains "bzw.".

    A string containing "bzw." is returned completely unchanged, including any other dots.

    Args:
        sentence: A string.

    Returns:
        The string without dots, or the unchanged string if it contains "bzw.".
    """
    if "bzw." in sentence:
        return sentence
    # Plain character removal; no regular expression (and no escaping) is needed.
    return sentence.replace(".", "")
def clean_sentence(sentence: str):
    """Runs the full clean-up pipeline on a string.

    Applies, in this order, remove_abbreviations, remove_dot and
    remove_nonessential_words_symbols.

    Args:
        sentence: A string.

    Returns:
        The string after those functions were applied.
    """
    return remove_nonessential_words_symbols(
        remove_dot(remove_abbreviations(sentence))
    )
def search_duden_meaning(word: str):
    """Extracts the meaning of a word from the Duden online dictionary.

    The word is first passed through delete_umlauts and then looked up with duden.get.

    Args:
        word: A string containing one word.

    Returns:
        The entry's meaning_overview, or None if duden.get returns None or the
        overview is an empty string.
    """
    entry = duden.get(delete_umlauts(word))
    if entry is None:
        return None
    meaning = entry.meaning_overview
    # An empty overview is reported the same way as a missing entry.
    return None if meaning == "" else meaning
def remove_one_word_result_multiples(df):
    """Removes one-word meanings of compounds that also have a longer meaning.

    A row is removed when its meaning consists of exactly one word and the same
    compound has at least one other row whose meaning consists of several words.
    Compounds that only have one-word meanings keep all their rows, and rows of
    other compounds are never affected, even if they share the same one-word meaning.

    Args:
        df: A pandas.DataFrame with the columns Meaning and Compound.

    Returns:
        A pandas.DataFrame with the remaining rows; the original index labels are kept
        and the input data frame is not modified.
    """
    # Missing meanings (None/NaN) count as zero words, so they are never removed here.
    word_counts = df['Meaning'].map(
        lambda meaning: len(meaning.split()) if isinstance(meaning, str) else 0
    )
    # Compounds that have at least one multi-word meaning.
    described = set(df.loc[word_counts > 1, 'Compound'])
    # Only one-word rows of exactly those compounds are redundant.
    redundant = (word_counts == 1) & df['Compound'].isin(described)
    return df[~redundant]
def split_semicolon(df):
    """Creates a new row in the data frame for each meaning that is separated by a semicolon.

    The row that held the combined text keeps the first meaning. Every further meaning
    is added as a new row whose index label lies strictly between the previous integer
    label and the row's own label (index - 0.5, index - 0.75, index - 0.875, ...), so
    each meaning gets its own row and no existing row is overwritten.

    The data frame is modified in place and also returned. Rows are written positionally,
    so the columns must be exactly Compound, Relation, Meaning in this order.
    NOTE(review): the fractional labels assume a numeric index with integer steps — confirm with callers.

    Args:
        df: A pandas.DataFrame with the columns Compound, Relation and Meaning.

    Returns:
        The pandas.DataFrame with a row for each meaning of a compound that has multiple
        meanings that were separated by a semicolon.
    """
    # Snapshot the rows up front because the loop body adds rows to df.
    for index, row in list(df.iterrows()):
        meaning = row['Meaning']
        # Skip missing (None/NaN) meanings and meanings without a semicolon.
        if not isinstance(meaning, str) or ";" not in meaning:
            continue
        compound = row['Compound']
        first, *extras = meaning.split("; ")
        # The relation of the first row of this compound is used for all its rows.
        relation = df.loc[df['Compound'] == compound, 'Relation'].iloc[0]
        df.loc[index] = [compound, relation, first]
        for position, extra in enumerate(extras, start=1):
            # 0.5, 0.75, 0.875, ... never reaches 1, so the label stays above index - 1.
            df.loc[index - (1 - 0.5 ** position)] = [compound, relation, extra]
    return df
def split_semicolon_including_raw(df):
    """Creates a new row in the data frame for each meaning that is separated by a semicolon.

    The row that held the combined text keeps the first meaning. Every further meaning
    is added as a new row whose index label lies strictly between the previous integer
    label and the row's own label (index - 0.5, index - 0.75, index - 0.875, ...), so
    no existing row is overwritten. Each new row carries the compound's relation and
    raw meaning.

    The data frame is modified in place and also returned. Rows are written positionally,
    so the columns must be exactly Compound, Relation, Raw_Meaning, Meaning in this order.
    NOTE(review): the fractional labels assume a numeric index with integer steps — confirm with callers.

    Args:
        df: A pandas.DataFrame with the columns Compound, Relation, Raw_Meaning and Meaning.

    Returns:
        The pandas.DataFrame with a row for each meaning of a compound that has multiple
        meanings that were separated by a semicolon.
    """
    # Snapshot the rows up front because the loop body adds rows to df.
    for index, row in list(df.iterrows()):
        meaning = row['Meaning']
        # Skip missing (None/NaN) meanings and meanings without a semicolon.
        if not isinstance(meaning, str) or ";" not in meaning:
            continue
        compound = row['Compound']
        first, *extras = meaning.split("; ")
        # Relation and raw meaning are taken from the first row of this compound.
        of_compound = df['Compound'] == compound
        relation = df.loc[of_compound, 'Relation'].iloc[0]
        raw_meaning = df.loc[of_compound, 'Raw_Meaning'].iloc[0]
        df.loc[index] = [compound, relation, raw_meaning, first]
        for position, extra in enumerate(extras, start=1):
            # 0.5, 0.75, 0.875, ... never reaches 1, so the label stays above index - 1.
            df.loc[index - (1 - 0.5 ** position)] = [compound, relation, raw_meaning, extra]
    return df