Skip to content

Commit bb8d420

Browse files
committed
merge of incoming version of RakeExtractor
2 parents 3cc2274 + bec88b9 commit bb8d420

File tree

1 file changed

+35
-45
lines changed
  • keywords-extraction/keywords-extraction-impl/src/main/java/pl/edu/icm/coansys/kwdextraction

1 file changed

+35
-45
lines changed

keywords-extraction/keywords-extraction-impl/src/main/java/pl/edu/icm/coansys/kwdextraction/RakeExtractor.java

+35-45
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
* You should have received a copy of the GNU Affero General Public License
1616
* along with CoAnSys. If not, see <http://www.gnu.org/licenses/>.
1717
*/
18-
1918
package pl.edu.icm.coansys.kwdextraction;
2019

2120
import java.io.*;
@@ -48,21 +47,21 @@ public class RakeExtractor {
4847
private enum Lang {
4948

5049
// language code, stopwords path
51-
// DE("de", "stopwords/stopwords_de.txt"),
52-
// DK("dk", "stopwords/stopwords_dk.txt"),
53-
EN("en", "stopwords/stopwords_en.txt"),
54-
// ES("es", "stopwords/stopwords_es.txt"),
55-
// FI("fi", "stopwords/stopwords_fi.txt"),
56-
FR("fr", "stopwords/stopwords_fr.txt"),
57-
// HU("hu", "stopwords/stopwords_hu.txt"),
58-
// IT("it", "stopwords/stopwords_it.txt"),
59-
// NL("nl", "stopwords/stopwords_nl.txt"),
60-
// NO("no", "stopwords/stopwords_no.txt"),
61-
PL("pl", "stopwords/stopwords_pl.txt");
62-
// PT("pt", "stopwords/stopwords_pt.txt"),
63-
// RU("ru", "stopwords/stopwords_ru.txt"),
64-
// SE("se", "stopwords/stopwords_se.txt"),
65-
// TR("tr", "stopwords/stopwords_tr.txt");
50+
DE("de", "stopwords/stopwords_de.txt"),
51+
DK("dk", "stopwords/stopwords_dk.txt"),
52+
EN("en", "stopwords/stopwords_en.txt"),
53+
ES("es", "stopwords/stopwords_es.txt"),
54+
FI("fi", "stopwords/stopwords_fi.txt"),
55+
FR("fr", "stopwords/stopwords_fr.txt"),
56+
HU("hu", "stopwords/stopwords_hu.txt"),
57+
IT("it", "stopwords/stopwords_it.txt"),
58+
NL("nl", "stopwords/stopwords_nl.txt"),
59+
NO("no", "stopwords/stopwords_no.txt"),
60+
PL("pl", "stopwords/stopwords_pl.txt"),
61+
PT("pt", "stopwords/stopwords_pt.txt"),
62+
RU("ru", "stopwords/stopwords_ru.txt"),
63+
SE("se", "stopwords/stopwords_se.txt"),
64+
TR("tr", "stopwords/stopwords_tr.txt");
6665
private String langCode;
6766
private String stopwordsPath;
6867

@@ -108,12 +107,10 @@ private enum ExtractionOption {
108107
}
109108

110109
/**
111-
* Every constructor sets this.content (document's content) and calls
112-
* prepareToExtraction()
110+
* Every constructor sets this.content (document's content) and calls prepareToExtraction()
113111
*
114112
* @param content Document's content as a String
115-
* @param langCode Document's language (texts in other languages will be
116-
* ignored)
113+
* @param langCode Document's language (texts in other languages will be ignored)
117114
* @throws IOException
118115
*/
119116
public RakeExtractor(String content, String langCode) throws IOException {
@@ -123,12 +120,10 @@ public RakeExtractor(String content, String langCode) throws IOException {
123120
}
124121

125122
/**
126-
* Every constructor sets this.content (document's content) and calls
127-
* prepareToExtraction()
123+
* Every constructor sets this.content (document's content) and calls prepareToExtraction()
128124
*
129125
* @param pdfContent Byte array containing a PDF file
130-
* @param langCode Document's language (texts in other languages will be
131-
* ignored)
126+
* @param langCode Document's language (texts in other languages will be ignored)
132127
* @throws AnalysisException
133128
* @throws IOException
134129
*/
@@ -139,13 +134,11 @@ public RakeExtractor(byte[] pdfContent, String langCode) throws AnalysisExceptio
139134
}
140135

141136
/**
142-
* Every constructor sets this.content (document's content) and calls
143-
* prepareToExtraction()
137+
* Every constructor sets this.content (document's content) and calls prepareToExtraction()
144138
*
145139
* @param docWrapper Protocol buffers message containing document
146-
* @param option specifies which parts of the document are searched while
147-
* extracting keywords. Possible values: ABSTRACT - only the abstract is
148-
* processed, CONTENT - only the body of the document is processed,
140+
* @param option specifies which parts of the document are searched while extracting keywords. Possible values:
141+
* ABSTRACT - only the abstract is processed, CONTENT - only the body of the document is processed,
149142
* CONTENT_AND_ABSTRACT - both abstract and body are processed.
150143
* @throws IOException
151144
*/
@@ -183,8 +176,7 @@ public RakeExtractor(DocumentProtos.DocumentWrapper docWrapper, String option, S
183176
* Extract text from pdf stream
184177
*
185178
* @param pdfContent content of pdf file
186-
* @param lang Document's language (texts in other languages will be
187-
* ignored)
179+
* @param lang Document's language (texts in other languages will be ignored)
188180
* @return String object containing document content
189181
* @throws IOException
190182
* @throws AnalysisException
@@ -220,8 +212,7 @@ private String filterTextByLang(String text, String language) throws IOException
220212
}
221213

222214
/**
223-
* All steps of keyword extraction. Not to be called before setting of
224-
* this.content, this.lang and this.option.
215+
* Runs all steps of keyword extraction. Must not be called before this.content, this.lang and this.option are set.
225216
*
226217
* @throws IOException
227218
*/
@@ -246,7 +237,7 @@ private static Set<String> loadStopwords(Lang lang) throws IOException {
246237
BufferedReader br = null;
247238

248239
stopwordsStream = RakeExtractor.class.getClassLoader().getResourceAsStream(lang.stopwordsPath);
249-
240+
250241
try {
251242
isr = new InputStreamReader(stopwordsStream, Charset.forName("UTF-8"));
252243
br = new BufferedReader(isr);
@@ -259,16 +250,14 @@ private static Set<String> loadStopwords(Lang lang) throws IOException {
259250
}
260251
stopword = br.readLine();
261252
}
262-
}
263-
finally {
253+
} finally {
264254
IOUtils.closeQuietly(br);
265255
}
266256
return result;
267257
}
268258

269259
/**
270-
* Finding words or word sequences separated by stopwords, punctuation marks
271-
* etc.
260+
* Finds words or word sequences delimited by stopwords, punctuation marks, etc.
272261
*/
273262
private void extractKeywordCandidates() {
274263

@@ -412,14 +401,15 @@ public List<String> getKeywords(int n) {
412401

413402
private void setLang(String langCode) {
414403
this.lang = null;
415-
for(Lang curr : Lang.values()){
416-
if(curr.langCode.equals(langCode)){
417-
this.lang = curr;
418-
break;
419-
}
404+
for (Lang curr : Lang.values()) {
405+
if (curr.langCode.equals(langCode)) {
406+
this.lang = curr;
407+
System.err.println("Wykryty język: " + langCode);
408+
break;
409+
}
420410
}
421-
if(this.lang==null){
422-
this.lang = Lang.EN;
411+
if (this.lang == null) {
412+
this.lang = Lang.EN;
423413
}
424414
}
425415

0 commit comments

Comments
 (0)