Skip to content

Commit c95feca

Browse files
committed
Merge branch 'master' of git://github.com/CeON/CoAnSys
2 parents bb8d420 + e0b1d93 commit c95feca

File tree

115 files changed

+1062
-17795
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

115 files changed

+1062
-17795
lines changed

.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@
88
.project
99
.settings
1010

11+
# Netbeans files #
12+
nb-configuration.xml
13+
1114
# IntelliJ IDEA files #
1215
.idea
1316
*.iml
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,309 @@
1+
<!--
2+
~ This file is part of CoAnSys project.
3+
~ Copyright (c) 2012-2014 ICM-UW
4+
~
5+
~ CoAnSys is free software: you can redistribute it and/or modify
6+
~ it under the terms of the GNU Affero General Public License as published by
7+
~ the Free Software Foundation, either version 3 of the License, or
8+
~ (at your option) any later version.
9+
~
10+
~ CoAnSys is distributed in the hope that it will be useful,
11+
~ but WITHOUT ANY WARRANTY; without even the implied warranty of
12+
~ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13+
~ GNU Affero General Public License for more details.
14+
~
15+
~ You should have received a copy of the GNU Affero General Public License
16+
~ along with CoAnSys. If not, see <http://www.gnu.org/licenses/>.
17+
-->
18+
19+
<workflow-app xmlns="uri:oozie:workflow:0.4" name="citations-core-heuristic-workflow">
20+
<!--
  Formal parameters of the workflow.
  Declared with a default value: queueName (default), pool (default),
  reduceTasks (36), unmatched (SKIP_UNMATCHED_COMPUTATION).
  Declared without a value: jobTracker, nameNode, workingDirectory,
  sourceEntities, destinationEntities, citationHasher, documentHasher, output.
  NOTE(review): parameters without a value are presumably mandatory at
  submission time; confirm against the Oozie workflow specification.
  The default of "unmatched" is the sentinel compared in the
  "need-unmatched" decision node.
-->
<parameters>
21+
<property>
22+
<name>jobTracker</name>
23+
</property>
24+
<property>
25+
<name>nameNode</name>
26+
</property>
27+
<property>
28+
<name>queueName</name>
29+
<value>default</value>
30+
</property>
31+
<property>
32+
<name>pool</name>
33+
<value>default</value>
34+
</property>
35+
<property>
36+
<name>reduceTasks</name>
37+
<value>36</value>
38+
</property>
39+
<property>
40+
<name>workingDirectory</name>
41+
</property>
42+
<property>
43+
<name>sourceEntities</name>
44+
</property>
45+
<property>
46+
<name>destinationEntities</name>
47+
</property>
48+
<property>
49+
<name>citationHasher</name>
50+
</property>
51+
<property>
52+
<name>documentHasher</name>
53+
</property>
54+
<property>
55+
<name>output</name>
56+
</property>
57+
<property>
58+
<name>unmatched</name>
59+
<value>SKIP_UNMATCHED_COMPUTATION</value>
60+
</property>
61+
</parameters>
62+
<!--
  Settings shared by the actions of this workflow: cluster endpoints taken
  from ${jobTracker} and ${nameNode}, new-API mapper and reducer switched on,
  job queue ${queueName}, fair-scheduler pool ${pool} (set both as
  mapred.fairscheduler.pool and under the oozie.launcher. prefix),
  SequenceFile input and output formats, and ${reduceTasks} reduce tasks.
  The "heuristic" and "unmatched" actions set mapreduce.inputformat.class
  again to DelegatingInputFormat in their own configuration.
-->
<global>
63+
<job-tracker>${jobTracker}</job-tracker>
64+
<name-node>${nameNode}</name-node>
65+
<configuration>
66+
<property>
67+
<name>mapred.mapper.new-api</name>
68+
<value>true</value>
69+
</property>
70+
<property>
71+
<name>mapred.reducer.new-api</name>
72+
<value>true</value>
73+
</property>
74+
<property>
75+
<name>mapred.job.queue.name</name>
76+
<value>${queueName}</value>
77+
</property>
78+
<property>
79+
<name>oozie.launcher.mapred.fairscheduler.pool</name>
80+
<value>${pool}</value>
81+
</property>
82+
<property>
83+
<name>mapred.fairscheduler.pool</name>
84+
<value>${pool}</value>
85+
</property>
86+
<property>
87+
<name>mapreduce.inputformat.class</name>
88+
<value>org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat</value>
89+
</property>
90+
<property>
91+
<name>mapreduce.outputformat.class</name>
92+
<value>org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat</value>
93+
</property>
94+
<property>
95+
<name>mapred.reduce.tasks</name>
96+
<value>${reduceTasks}</value>
97+
</property>
98+
</configuration>
99+
</global>
100+
<!-- Entry point: execution begins at the "prepare" action. -->
<start to="prepare"/>
101+
102+
<!--
  File-system step: deletes ${workingDirectory} and then creates it again,
  presumably so that intermediate data from an earlier run cannot interfere.
  Continues with "heuristic" on success and with "fail" on error.
-->
<action name="prepare">
103+
<fs>
104+
<delete path="${workingDirectory}"/>
105+
<mkdir path="${workingDirectory}"/>
106+
</fs>
107+
<ok to="heuristic"/>
108+
<error to="fail"/>
109+
</action>
110+
111+
<!--
  First map-reduce job. DelegatingInputFormat and DelegatingMapper assign a
  mapper per input path: ${sourceEntities} is mapped by CitationHashGenerator
  and ${destinationEntities} by DocumentHashGenerator, both read with
  SequenceFileInputFormat. The hasher implementations come from
  ${citationHasher} and ${documentHasher}; only documents are marked
  (coansys.citations.mark.documents = true, mark.citations = false).
  Map output key and value are MarkedText, with MarkedTextPartitioner,
  MarkedTextGroupComparator and MarkedTextSortComparator controlling the
  shuffle; the reducer is HashJoiner. The Text/Text result is written to
  ${workingDirectory}/heuristic_with_dups, the input of "distinctor".
  NOTE(review): the values of mapred.input.dir.mappers and
  mapred.input.dir.formats span several lines, so the element text carries
  leading and trailing line breaks; confirm that the consumer trims them.
-->
<action name="heuristic">
112+
<map-reduce>
113+
<configuration>
114+
<property>
115+
<name>mapreduce.inputformat.class</name>
116+
<value>org.apache.hadoop.mapreduce.lib.input.DelegatingInputFormat</value>
117+
</property>
118+
<property>
119+
<name>mapreduce.map.class</name>
120+
<value>org.apache.hadoop.mapreduce.lib.input.DelegatingMapper</value>
121+
</property>
122+
<property>
123+
<name>mapred.input.dir.mappers</name>
124+
<value>
125+
${sourceEntities};pl.edu.icm.coansys.citations.mappers.CitationHashGenerator,${destinationEntities};pl.edu.icm.coansys.citations.mappers.DocumentHashGenerator
126+
</value>
127+
</property>
128+
<property>
129+
<name>coansys.citations.citation.hasher</name>
130+
<value>${citationHasher}</value>
131+
</property>
132+
<property>
133+
<name>coansys.citations.mark.citations</name>
134+
<value>false</value>
135+
</property>
136+
<property>
137+
<name>coansys.citations.document.hasher</name>
138+
<value>${documentHasher}</value>
139+
</property>
140+
<property>
141+
<name>coansys.citations.mark.documents</name>
142+
<value>true</value>
143+
</property>
144+
<property>
145+
<name>mapred.input.dir.formats</name>
146+
<value>
147+
${sourceEntities};org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat,${destinationEntities};org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat
148+
</value>
149+
</property>
150+
<property>
151+
<name>mapred.mapoutput.key.class</name>
152+
<value>pl.edu.icm.coansys.citations.data.MarkedText</value>
153+
</property>
154+
<property>
155+
<name>mapred.mapoutput.value.class</name>
156+
<value>pl.edu.icm.coansys.citations.data.MarkedText</value>
157+
</property>
158+
<property>
159+
<name>mapreduce.partitioner.class</name>
160+
<value>pl.edu.icm.coansys.citations.data.MarkedTextPartitioner</value>
161+
</property>
162+
<property>
163+
<name>mapred.output.value.groupfn.class</name>
164+
<value>pl.edu.icm.coansys.citations.data.MarkedTextGroupComparator</value>
165+
</property>
166+
<property>
167+
<name>mapred.output.key.comparator.class</name>
168+
<value>pl.edu.icm.coansys.citations.data.MarkedTextSortComparator</value>
169+
</property>
170+
<property>
171+
<name>mapreduce.reduce.class</name>
172+
<value>pl.edu.icm.coansys.citations.reducers.HashJoiner</value>
173+
</property>
174+
<property>
175+
<name>mapred.output.dir</name>
176+
<value>${workingDirectory}/heuristic_with_dups</value>
177+
</property>
178+
<property>
179+
<name>mapred.output.key.class</name>
180+
<value>org.apache.hadoop.io.Text</value>
181+
</property>
182+
<property>
183+
<name>mapred.output.value.class</name>
184+
<value>org.apache.hadoop.io.Text</value>
185+
</property>
186+
</configuration>
187+
</map-reduce>
188+
<ok to="distinctor"/>
189+
<error to="fail"/>
190+
</action>
191+
192+
<!--
  Second map-reduce job. Reads ${workingDirectory}/heuristic_with_dups (the
  output of "heuristic"), maps with IdCombiner and reduces with
  IdDistinctorExtractor. Map output is Text / NullWritable; the final
  Text / Text records are written to ${output}.
  NOTE(review): the directory and class names suggest this pass removes
  duplicate entries; that cannot be verified from this file.
  Continues with the "need-unmatched" decision on success and with "fail"
  on error.
-->
<action name="distinctor">
193+
<map-reduce>
194+
<configuration>
195+
<property>
196+
<name>mapreduce.map.class</name>
197+
<value>pl.edu.icm.coansys.citations.mappers.IdCombiner</value>
198+
</property>
199+
<property>
200+
<name>mapreduce.reduce.class</name>
201+
<value>pl.edu.icm.coansys.citations.reducers.IdDistinctorExtractor</value>
202+
</property>
203+
<property>
204+
<name>mapred.input.dir</name>
205+
<value>${workingDirectory}/heuristic_with_dups</value>
206+
</property>
207+
<property>
208+
<name>mapred.output.dir</name>
209+
<value>${output}</value>
210+
</property>
211+
<property>
212+
<name>mapred.mapoutput.key.class</name>
213+
<value>org.apache.hadoop.io.Text</value>
214+
</property>
215+
<property>
216+
<name>mapred.mapoutput.value.class</name>
217+
<value>org.apache.hadoop.io.NullWritable</value>
218+
</property>
219+
<property>
220+
<name>mapred.output.key.class</name>
221+
<value>org.apache.hadoop.io.Text</value>
222+
</property>
223+
<property>
224+
<name>mapred.output.value.class</name>
225+
<value>org.apache.hadoop.io.Text</value>
226+
</property>
227+
</configuration>
228+
</map-reduce>
229+
<ok to="need-unmatched"/>
230+
<error to="fail"/>
231+
</action>
232+
233+
<!--
  Decides whether the optional "unmatched" job runs. When the "unmatched"
  parameter equals the sentinel SKIP_UNMATCHED_COMPUTATION (its declared
  default) the workflow goes straight to "end"; any other value falls
  through to the "unmatched" action, which uses that value as its
  mapred.output.dir.
-->
<decision name="need-unmatched">
234+
<switch>
235+
<case to="end">${unmatched == "SKIP_UNMATCHED_COMPUTATION"}</case>
236+
<default to="unmatched"/>
237+
</switch>
238+
</decision>
239+
240+
<!--
  Optional third map-reduce job, reached only through the default branch of
  the "need-unmatched" decision. DelegatingInputFormat and DelegatingMapper
  assign a mapper per input path: ${sourceEntities} is mapped by
  CitationsMarker and ${output} (written by "distinctor") by HeuristicMarker,
  both read with SequenceFileInputFormat. Map output is MarkedText /
  MarkedBytesWritable, shuffled with MarkedTextPartitioner,
  MarkedTextGroupComparator and MarkedTextSortComparator; the reducer is
  Unmatched. The Text / BytesWritable result is written to the path held in
  ${unmatched}.
  NOTE(review): the values of mapred.input.dir.mappers and
  mapred.input.dir.formats span several lines, so the element text carries
  leading and trailing line breaks; confirm that the consumer trims them.
-->
<action name="unmatched">
241+
<map-reduce>
242+
<configuration>
243+
<property>
244+
<name>mapreduce.inputformat.class</name>
245+
<value>org.apache.hadoop.mapreduce.lib.input.DelegatingInputFormat</value>
246+
</property>
247+
<property>
248+
<name>mapreduce.map.class</name>
249+
<value>org.apache.hadoop.mapreduce.lib.input.DelegatingMapper</value>
250+
</property>
251+
<property>
252+
<name>mapred.input.dir.mappers</name>
253+
<value>
254+
${sourceEntities};pl.edu.icm.coansys.citations.mappers.CitationsMarker,${output};pl.edu.icm.coansys.citations.mappers.HeuristicMarker
255+
</value>
256+
</property>
257+
<property>
258+
<name>mapred.input.dir.formats</name>
259+
<value>
260+
${sourceEntities};org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat,${output};org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat
261+
</value>
262+
</property>
263+
<property>
264+
<name>mapred.mapoutput.key.class</name>
265+
<value>pl.edu.icm.coansys.citations.data.MarkedText</value>
266+
</property>
267+
<property>
268+
<name>mapred.mapoutput.value.class</name>
269+
<value>pl.edu.icm.coansys.citations.data.MarkedBytesWritable</value>
270+
</property>
271+
<property>
272+
<name>mapreduce.partitioner.class</name>
273+
<value>pl.edu.icm.coansys.citations.data.MarkedTextPartitioner</value>
274+
</property>
275+
<property>
276+
<name>mapred.output.value.groupfn.class</name>
277+
<value>pl.edu.icm.coansys.citations.data.MarkedTextGroupComparator</value>
278+
</property>
279+
<property>
280+
<name>mapred.output.key.comparator.class</name>
281+
<value>pl.edu.icm.coansys.citations.data.MarkedTextSortComparator</value>
282+
</property>
283+
<property>
284+
<name>mapreduce.reduce.class</name>
285+
<value>pl.edu.icm.coansys.citations.reducers.Unmatched</value>
286+
</property>
287+
<property>
288+
<name>mapred.output.dir</name>
289+
<value>${unmatched}</value>
290+
</property>
291+
<property>
292+
<name>mapred.output.key.class</name>
293+
<value>org.apache.hadoop.io.Text</value>
294+
</property>
295+
<property>
296+
<name>mapred.output.value.class</name>
297+
<value>org.apache.hadoop.io.BytesWritable</value>
298+
</property>
299+
</configuration>
300+
</map-reduce>
301+
<ok to="end"/>
302+
<error to="fail"/>
303+
</action>
304+
305+
<!--
  Terminal nodes. "fail" is the error target of every action above and kills
  the workflow with the error message of the last node that failed; "end"
  marks normal completion.
-->
<kill name="fail">
306+
<message>Workflow failed, error message [${wf:errorMessage(wf:lastErrorNode())}]</message>
307+
</kill>
308+
<end name="end"/>
309+
</workflow-app>

0 commit comments

Comments
 (0)