15
15
* You should have received a copy of the GNU Affero General Public License
16
16
* along with CoAnSys. If not, see <http://www.gnu.org/licenses/>.
17
17
*/
18
-
19
18
package pl .edu .icm .coansys .kwdextraction ;
20
19
21
20
import java .io .*;
@@ -48,21 +47,21 @@ public class RakeExtractor {
48
47
private enum Lang {
49
48
50
49
// language code, stopwords path
51
- // DE("de", "stopwords/stopwords_de.txt"),
52
- // DK("dk", "stopwords/stopwords_dk.txt"),
53
- EN ("en" , "stopwords/stopwords_en.txt" ),
54
- // ES("es", "stopwords/stopwords_es.txt"),
55
- // FI("fi", "stopwords/stopwords_fi.txt"),
56
- FR ("fr" , "stopwords/stopwords_fr.txt" ),
57
- // HU("hu", "stopwords/stopwords_hu.txt"),
58
- // IT("it", "stopwords/stopwords_it.txt"),
59
- // NL("nl", "stopwords/stopwords_nl.txt"),
60
- // NO("no", "stopwords/stopwords_no.txt"),
61
- PL ("pl" , "stopwords/stopwords_pl.txt" );
62
- // PT("pt", "stopwords/stopwords_pt.txt"),
63
- // RU("ru", "stopwords/stopwords_ru.txt"),
64
- // SE("se", "stopwords/stopwords_se.txt"),
65
- // TR("tr", "stopwords/stopwords_tr.txt");
50
+ DE ("de" , "stopwords/stopwords_de.txt" ),
51
+ DK ("dk" , "stopwords/stopwords_dk.txt" ),
52
+ EN ("en" , "stopwords/stopwords_en.txt" ),
53
+ ES ("es" , "stopwords/stopwords_es.txt" ),
54
+ FI ("fi" , "stopwords/stopwords_fi.txt" ),
55
+ FR ("fr" , "stopwords/stopwords_fr.txt" ),
56
+ HU ("hu" , "stopwords/stopwords_hu.txt" ),
57
+ IT ("it" , "stopwords/stopwords_it.txt" ),
58
+ NL ("nl" , "stopwords/stopwords_nl.txt" ),
59
+ NO ("no" , "stopwords/stopwords_no.txt" ),
60
+ PL ("pl" , "stopwords/stopwords_pl.txt" ),
61
+ PT ("pt" , "stopwords/stopwords_pt.txt" ),
62
+ RU ("ru" , "stopwords/stopwords_ru.txt" ),
63
+ SE ("se" , "stopwords/stopwords_se.txt" ),
64
+ TR ("tr" , "stopwords/stopwords_tr.txt" );
66
65
private String langCode ;
67
66
private String stopwordsPath ;
68
67
@@ -108,12 +107,10 @@ private enum ExtractionOption {
108
107
}
109
108
110
109
/**
111
- * Every constructor sets this.content (document's content) and calls
112
- * prepareToExtraction()
110
+ * Every constructor sets this.content (document's content) and calls prepareToExtraction()
113
111
*
114
112
* @param content Document's content as a String
115
- * @param langCode Document's language (texts in other languages will be
116
- * ignored)
113
+ * @param langCode Document's language (texts in other languages will be ignored)
117
114
* @throws IOException
118
115
*/
119
116
public RakeExtractor (String content , String langCode ) throws IOException {
@@ -123,12 +120,10 @@ public RakeExtractor(String content, String langCode) throws IOException {
123
120
}
124
121
125
122
/**
126
- * Every constructor sets this.content (document's content) and calls
127
- * prepareToExtraction()
123
+ * Every constructor sets this.content (document's content) and calls prepareToExtraction()
128
124
*
129
125
* @param pdfContent Byte array containing a PDF file
130
- * @param langCode Document's language (texts in other languages will be
131
- * ignored)
126
+ * @param langCode Document's language (texts in other languages will be ignored)
132
127
* @throws AnalysisException
133
128
* @throws IOException
134
129
*/
@@ -139,13 +134,11 @@ public RakeExtractor(byte[] pdfContent, String langCode) throws AnalysisExceptio
139
134
}
140
135
141
136
/**
142
- * Every constructor sets this.content (document's content) and calls
143
- * prepareToExtraction()
137
+ * Every constructor sets this.content (document's content) and calls prepareToExtraction()
144
138
*
145
139
* @param docWrapper Protocol buffers message containing document
146
- * @param option specifies which parts of the document are searched while
147
- * extracting keywords. Possible values: ABSTRACT - only the abstract is
148
- * processed, CONTENT - on the body of the document is processed,
140
+ * @param option specifies which parts of the document are searched while extracting keywords. Possible values:
141
+ * ABSTRACT - only the abstract is processed, CONTENT - only the body of the document is processed,
149
142
* CONTENT_AND_ABSTRACT - both abstract and body are processed.
150
143
* @throws IOException
151
144
*/
@@ -183,8 +176,7 @@ public RakeExtractor(DocumentProtos.DocumentWrapper docWrapper, String option, S
183
176
* Extract text from pdf stream
184
177
*
185
178
* @param pdfContent content of pdf file
186
- * @param lang Document's language (texts in other languages will be
187
- * ignored)
179
+ * @param lang Document's language (texts in other languages will be ignored)
188
180
* @return String object containing document content
189
181
* @throws IOException
190
182
* @throws AnalysisException
@@ -220,8 +212,7 @@ private String filterTextByLang(String text, String language) throws IOException
220
212
}
221
213
222
214
/**
223
- * All steps of keyword extraction. Not to be called before setting of
224
- * this.content, this.lang and this.option.
215
+ * All steps of keyword extraction. Not to be called before setting of this.content, this.lang and this.option.
225
216
*
226
217
* @throws IOException
227
218
*/
@@ -246,7 +237,7 @@ private static Set<String> loadStopwords(Lang lang) throws IOException {
246
237
BufferedReader br = null ;
247
238
248
239
stopwordsStream = RakeExtractor .class .getClassLoader ().getResourceAsStream (lang .stopwordsPath );
249
-
240
+
250
241
try {
251
242
isr = new InputStreamReader (stopwordsStream , Charset .forName ("UTF-8" ));
252
243
br = new BufferedReader (isr );
@@ -259,16 +250,14 @@ private static Set<String> loadStopwords(Lang lang) throws IOException {
259
250
}
260
251
stopword = br .readLine ();
261
252
}
262
- }
263
- finally {
253
+ } finally {
264
254
IOUtils .closeQuietly (br );
265
255
}
266
256
return result ;
267
257
}
268
258
269
259
/**
270
- * Finding words or word sequences separated by stopwords, punctuation marks
271
- * etc.
260
+ * Finding words or word sequences separated by stopwords, punctuation marks etc.
272
261
*/
273
262
private void extractKeywordCandidates () {
274
263
@@ -412,14 +401,15 @@ public List<String> getKeywords(int n) {
412
401
413
402
private void setLang (String langCode ) {
414
403
this .lang = null ;
415
- for (Lang curr : Lang .values ()){
416
- if (curr .langCode .equals (langCode )){
417
- this .lang = curr ;
418
- break ;
419
- }
404
+ for (Lang curr : Lang .values ()) {
405
+ if (curr .langCode .equals (langCode )) {
406
+ this .lang = curr ;
407
+ System .err .println ("Wykryty język: " + langCode );
408
+ break ;
409
+ }
420
410
}
421
- if (this .lang == null ){
422
- this .lang = Lang .EN ;
411
+ if (this .lang == null ) {
412
+ this .lang = Lang .EN ;
423
413
}
424
414
}
425
415
0 commit comments