This is an automated email from the ASF dual-hosted git repository. mawiesne pushed a commit to branch cleanup/drop-wikinews-importer-component in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git
commit ae856879e4e5b5e91319feb3c4f5e9bb8bcfc313 Author: Martin Wiesner <[email protected]> AuthorDate: Fri Mar 20 22:09:40 2026 +0100 Drop wikinews-importer component - reason: Niche, no tests, no value, stale dependency (bliki-core 2013) --- pom.xml | 1 - wikinews-importer/bin/converter | 21 -- wikinews-importer/pom.xml | 79 ------- .../FTC_begins_antitrust_inquiry_of_Google.xmi | 77 ------ .../Internet_hacking_group_LulzSec_disbands.xmi | 23 -- ...a_announces_troop_reductions_in_Afghanistan.xmi | 24 -- ...i\304\207_said_to_be_too_ill_to_face_trial.xmi" | 28 --- wikinews-importer/samples/TypeSystem.xml | 106 --------- .../samples/US_actor_Peter_Falk_dies_aged_83.xmi | 22 -- wikinews-importer/samples/wikinews.xml | 107 --------- .../wikinews_importer/AnnotatingMarkupParser.java | 262 --------------------- .../opennlp/wikinews_importer/Annotation.java | 37 --- .../apache/opennlp/wikinews_importer/UimaUtil.java | 131 ----------- .../wikinews_importer/WikinewsConverter.java | 196 --------------- .../wikinews_importer/WikinewsWikiModel.java | 48 ---- 15 files changed, 1162 deletions(-) diff --git a/pom.xml b/pom.xml index 90f6c33..d784dcc 100644 --- a/pom.xml +++ b/pom.xml @@ -111,7 +111,6 @@ <module>tf-ner-poc</module> <module>summarizer</module> <module>tagging-server</module> - <module>wikinews-importer</module> </modules> <properties> diff --git a/wikinews-importer/bin/converter b/wikinews-importer/bin/converter deleted file mode 100755 index 3b0285c..0000000 --- a/wikinews-importer/bin/converter +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/sh - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -mvn -e -q exec:java "-Dexec.mainClass=org.apache.opennlp.wikinews_importer.WikinewsConverter" "-Dexec.args=$*" diff --git a/wikinews-importer/pom.xml b/wikinews-importer/pom.xml deleted file mode 100644 index fff20d1..0000000 --- a/wikinews-importer/pom.xml +++ /dev/null @@ -1,79 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. ---> - -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> - <modelVersion>4.0.0</modelVersion> - <parent> - <groupId>org.apache.opennlp</groupId> - <artifactId>opennlp-sandbox</artifactId> - <version>3.0.0-SNAPSHOT</version> - </parent> - - <artifactId>wikinews-importer</artifactId> - <packaging>jar</packaging> - - <name>Apache OpenNLP Wikinews Importer</name> - - <dependencies> - - <dependency> - <groupId>info.bliki.wiki</groupId> - <artifactId>bliki-core</artifactId> - <version>3.0.19</version> - </dependency> - - <dependency> - <groupId>org.apache.uima</groupId> - <artifactId>uimaj-core</artifactId> - <version>${uimaj.version}</version> - <scope>compile</scope> - </dependency> - - <dependency> - <groupId>org.junit.jupiter</groupId> - <artifactId>junit-jupiter-api</artifactId> - </dependency> - - <dependency> - <groupId>org.junit.jupiter</groupId> - <artifactId>junit-jupiter-engine</artifactId> - </dependency> - - <dependency> - <groupId>org.junit.jupiter</groupId> - <artifactId>junit-jupiter-params</artifactId> - </dependency> - </dependencies> - - <build> - <plugins> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-compiler-plugin</artifactId> - <configuration> - <source>${maven.compiler.source}</source> - <target>${maven.compiler.target}</target> - <compilerArgument>-Xlint</compilerArgument> - </configuration> - </plugin> - </plugins> - </build> -</project> \ No newline at end of file diff --git a/wikinews-importer/samples/FTC_begins_antitrust_inquiry_of_Google.xmi b/wikinews-importer/samples/FTC_begins_antitrust_inquiry_of_Google.xmi deleted file mode 100644 index 19340d5..0000000 --- a/wikinews-importer/samples/FTC_begins_antitrust_inquiry_of_Google.xmi +++ /dev/null @@ -1,77 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> - -<xmi:XMI xmlns:cas="http:///uima/cas.ecore" xmlns:annotations="http:///org/apache/opennlp/annotations.ecore" xmlns:xmi="http://www.omg.org/XMI" xmlns:tcas="http:///uima/tcas.ecore" xmi:version="2.0"> - <cas:NULL xmi:id="0"/> - <cas:Sofa xmi:id="1" sofaNum="1" sofaID="_InitialView" mimeType="text" sofaString="FTC begins antitrust inquiry of Google Google has confirmed that it has "received formal notification," that the Federal Trade Commission (FTC) is investigating its business practices. The acknowledgment was posted on the internet search engine company's blog Friday. Google said it was unclear about the nature of the probe. A broad FTC investigation would cause t [...] - <tcas:DocumentAnnotation xmi:id="8" sofa="1" begin="0" end="3039" language="x-unspecified"/> - <annotations:Headline xmi:id="13" sofa="1" begin="0" end="38"/> - <annotations:Sentence xmi:id="17" sofa="1" begin="40" end="185"/> - <annotations:Sentence xmi:id="21" sofa="1" begin="186" end="326"/> - <annotations:Sentence xmi:id="25" sofa="1" begin="328" end="529"/> - <annotations:Sentence xmi:id="29" sofa="1" begin="531" end="667"/> - <annotations:Sentence xmi:id="33" sofa="1" begin="861" end="1095"/> - <annotations:Sentence xmi:id="37" sofa="1" begin="669" end="860"/> - <annotations:Sentence xmi:id="41" sofa="1" begin="2851" end="3039"/> - <annotations:Sentence xmi:id="45" sofa="1" begin="2673" end="2849"/> - <annotations:Sentence xmi:id="49" sofa="1" begin="2231" end="2443"/> - <annotations:Sentence xmi:id="53" sofa="1" begin="2444" end="2671"/> - <annotations:Sentence xmi:id="57" sofa="1" begin="2148" end="2229"/> - <annotations:Sentence xmi:id="61" sofa="1" begin="1888" end="2147"/> - <annotations:Sentence xmi:id="65" sofa="1" begin="1568" end="1722"/> - <annotations:Sentence xmi:id="69" sofa="1" begin="1723" end="1886"/> - <annotations:Sentence xmi:id="73" sofa="1" begin="1097" end="1275"/> - <annotations:Sentence xmi:id="77" sofa="1" begin="1276" end="1407"/> - <annotations:Sentence xmi:id="81" sofa="1" begin="1408" end="1566"/> - <annotations:Organization xmi:id="85" sofa="1" begin="0" end="3"/> - <annotations:Organization xmi:id="89" sofa="1" begin="32" end="38"/> - <annotations:Organization xmi:id="93" sofa="1" begin="40" end="46"/> - <annotations:Organization xmi:id="97" sofa="1" begin="114" end="144"/> - <annotations:Organization xmi:id="101" sofa="1" begin="269" end="275"/> - <annotations:Organization xmi:id="105" sofa="1" begin="336" end="339"/> - <annotations:Organization xmi:id="109" sofa="1" begin="437" end="446"/> - <annotations:Organization xmi:id="113" sofa="1" begin="513" end="522"/> - <annotations:Organization xmi:id="117" sofa="1" begin="535" end="538"/> - <annotations:Organization xmi:id="121" sofa="1" begin="663" end="666"/> - <annotations:Organization xmi:id="125" sofa="1" begin="669" end="681"/> - <annotations:Organization xmi:id="129" sofa="1" begin="715" end="718"/> - <annotations:Organization xmi:id="133" sofa="1" begin="757" end="763"/> - <annotations:Organization xmi:id="137" sofa="1" begin="861" end="867"/> - <annotations:Organization xmi:id="141" sofa="1" begin="881" end="887"/> - <annotations:Organization xmi:id="145" sofa="1" begin="1101" end="1104"/> - <annotations:Organization xmi:id="149" sofa="1" begin="1139" end="1145"/> - <annotations:Organization xmi:id="153" sofa="1" begin="1289" end="1292"/> - <annotations:Organization xmi:id="157" sofa="1" begin="1295" end="1301"/> - <annotations:Organization xmi:id="161" sofa="1" begin="1398" end="1406"/> - <annotations:Organization xmi:id="165" sofa="1" begin="1559" end="1564"/> - <annotations:Organization xmi:id="169" sofa="1" begin="1568" end="1574"/> - <annotations:Organization xmi:id="173" sofa="1" begin="1699" end="1705"/> - <annotations:Organization xmi:id="177" sofa="1" begin="1764" end="1787"/> - <annotations:Organization xmi:id="181" sofa="1" begin="1797" end="1800"/> - <annotations:Organization xmi:id="185" sofa="1" begin="1865" end="1871"/> - <annotations:Organization xmi:id="189" sofa="1" begin="1971" end="1974"/> - <annotations:Organization xmi:id="193" sofa="1" begin="2056" end="2062"/> - <annotations:Organization xmi:id="197" sofa="1" begin="2231" end="2237"/> - <annotations:Organization xmi:id="201" sofa="1" begin="2292" end="2295"/> - <annotations:Organization xmi:id="205" sofa="1" begin="2418" end="2424"/> - <annotations:Organization xmi:id="209" sofa="1" begin="2507" end="2510"/> - <annotations:Organization xmi:id="213" sofa="1" begin="2673" end="2682"/> - <annotations:Organization xmi:id="217" sofa="1" begin="2701" end="2707"/> - <annotations:Organization xmi:id="221" sofa="1" begin="2851" end="2857"/> - <annotations:Organization xmi:id="225" sofa="1" begin="3035" end="3038"/> - <annotations:Organization xmi:id="229" sofa="1" begin="2963" end="2969"/> - <annotations:Person xmi:id="233" sofa="1" begin="1732" end="1745"/> - <annotations:Person xmi:id="237" sofa="1" begin="1888" end="1902"/> - <annotations:Person xmi:id="241" sofa="1" begin="2245" end="2257"/> - <annotations:Person xmi:id="245" sofa="1" begin="2448" end="2455"/> - <annotations:Paragraph xmi:id="249" sofa="1" begin="40" end="326"/> - <annotations:Paragraph xmi:id="253" sofa="1" begin="328" end="529"/> - <annotations:Paragraph xmi:id="257" sofa="1" begin="531" end="667"/> - <annotations:Paragraph xmi:id="261" sofa="1" begin="669" end="1095"/> - <annotations:Paragraph xmi:id="265" sofa="1" begin="1097" end="1566"/> - <annotations:Paragraph xmi:id="269" sofa="1" begin="1568" end="1886"/> - <annotations:Paragraph xmi:id="273" sofa="1" begin="1888" end="2229"/> - <annotations:Paragraph xmi:id="277" sofa="1" begin="2231" end="2671"/> - <annotations:Paragraph xmi:id="281" sofa="1" begin="2673" end="2849"/> - <annotations:Paragraph xmi:id="285" sofa="1" begin="2851" end="3039"/> - <cas:View sofa="1" members="8 13 17 21 25 29 33 37 41 45 49 53 57 61 65 69 73 77 81 85 89 93 97 101 105 109 113 117 121 125 129 133 137 141 145 149 153 157 161 165 169 173 177 181 185 189 193 197 201 205 209 213 217 221 225 229 233 237 241 245 249 253 257 261 265 269 273 277 281 285"/> -</xmi:XMI> diff --git a/wikinews-importer/samples/Internet_hacking_group_LulzSec_disbands.xmi b/wikinews-importer/samples/Internet_hacking_group_LulzSec_disbands.xmi deleted file mode 100644 index 8ebfa3d..0000000 --- a/wikinews-importer/samples/Internet_hacking_group_LulzSec_disbands.xmi +++ /dev/null @@ -1,23 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<xmi:XMI xmlns:cas="http:///uima/cas.ecore" xmlns:annotations="http:///org/apache/opennlp/annotations.ecore" xmlns:xmi="http://www.omg.org/XMI" xmlns:tcas="http:///uima/tcas.ecore" xmi:version="2.0"> - <cas:NULL xmi:id="0"/> - <cas:Sofa xmi:id="1" sofaNum="1" sofaID="_InitialView" mimeType="text" sofaString="Internet hacking group LulzSec disbands The computer hacking organization Lulz Security disbanded yesterday, said the group in a statement. Released via Pastebin, it states "[o]ur planned 50 day cruise has expired, and we must now sail into the distance." The announcement comes a day after The Guardian released leaked IRC logs of private conversations between Lul [...] - <tcas:DocumentAnnotation xmi:id="8" sofa="1" begin="0" end="1286" language="x-unspecified"/> - <annotations:Headline xmi:id="13" sofa="1" begin="0" end="39"/> - <annotations:Paragraph xmi:id="17" sofa="1" begin="41" end="256"/> - <annotations:Paragraph xmi:id="21" sofa="1" begin="258" end="455"/> - <annotations:Paragraph xmi:id="25" sofa="1" begin="457" end="748"/> - <annotations:Paragraph xmi:id="29" sofa="1" begin="750" end="1167"/> - <annotations:Paragraph xmi:id="33" sofa="1" begin="1169" end="1285"/> - <annotations:Sentence xmi:id="37" sofa="1" begin="141" end="256"/> - <annotations:Sentence xmi:id="41" sofa="1" begin="41" end="140"/> - <annotations:Sentence xmi:id="45" sofa="1" begin="258" end="455"/> - <annotations:Sentence xmi:id="49" sofa="1" begin="666" end="748"/> - <annotations:Sentence xmi:id="53" sofa="1" begin="537" end="665"/> - <annotations:Sentence xmi:id="57" sofa="1" begin="457" end="536"/> - <annotations:Sentence xmi:id="61" sofa="1" begin="750" end="856"/> - <annotations:Sentence xmi:id="65" sofa="1" begin="857" end="959"/> - <annotations:Sentence xmi:id="69" sofa="1" begin="960" end="1167"/> - <annotations:Sentence xmi:id="73" sofa="1" begin="1169" end="1285"/> - <cas:View sofa="1" members="8 13 17 21 25 29 33 37 41 45 49 53 57 61 65 69 73"/> -</xmi:XMI> diff --git a/wikinews-importer/samples/Obama_announces_troop_reductions_in_Afghanistan.xmi b/wikinews-importer/samples/Obama_announces_troop_reductions_in_Afghanistan.xmi deleted file mode 100644 index ad4f430..0000000 --- a/wikinews-importer/samples/Obama_announces_troop_reductions_in_Afghanistan.xmi +++ /dev/null @@ -1,24 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<xmi:XMI xmlns:cas="http:///uima/cas.ecore" xmlns:annotations="http:///org/apache/opennlp/annotations.ecore" xmlns:xmi="http://www.omg.org/XMI" xmlns:tcas="http:///uima/tcas.ecore" xmi:version="2.0"> - <cas:NULL xmi:id="0"/> - <cas:Sofa xmi:id="1" sofaNum="1" sofaID="_InitialView" mimeType="text" sofaString="Obama announces troop reductions in Afghanistan A third of U.S. forces in Afghanistan are to be withdrawn from the country by the end of next year, president Barack Obama has announced. In a televised statement on Wednesday evening, Obama announced 33,000 soldiers would be withdrawn from the country by the summer of next year, and declared the U.S. had beaten al-Qaeda and t [...] - <tcas:DocumentAnnotation xmi:id="8" sofa="1" begin="0" end="1807" language="x-unspecified"/> - <annotations:Headline xmi:id="13" sofa="1" begin="0" end="47"/> - <annotations:Paragraph xmi:id="17" sofa="1" begin="49" end="388"/> - <annotations:Paragraph xmi:id="21" sofa="1" begin="390" end="807"/> - <annotations:Paragraph xmi:id="25" sofa="1" begin="809" end="1259"/> - <annotations:Paragraph xmi:id="29" sofa="1" begin="1261" end="1484"/> - <annotations:Paragraph xmi:id="33" sofa="1" begin="1486" end="1646"/> - <annotations:Paragraph xmi:id="37" sofa="1" begin="1648" end="1806"/> - <annotations:Sentence xmi:id="41" sofa="1" begin="49" end="186"/> - <annotations:Sentence xmi:id="45" sofa="1" begin="187" end="388"/> - <annotations:Sentence xmi:id="49" sofa="1" begin="390" end="626"/> - <annotations:Sentence xmi:id="53" sofa="1" begin="627" end="807"/> - <annotations:Sentence xmi:id="57" sofa="1" begin="809" end="1058"/> - <annotations:Sentence xmi:id="61" sofa="1" begin="1059" end="1259"/> - <annotations:Sentence xmi:id="65" sofa="1" begin="1261" end="1484"/> - <annotations:Sentence xmi:id="69" sofa="1" begin="1587" end="1646"/> - <annotations:Sentence xmi:id="73" sofa="1" begin="1486" end="1586"/> - <annotations:Sentence xmi:id="77" sofa="1" begin="1648" end="1806"/> - <cas:View sofa="1" members="8 13 17 21 25 29 33 37 41 45 49 53 57 61 65 69 73 77"/> -</xmi:XMI> diff --git "a/wikinews-importer/samples/Ratko_Mladi\304\207_said_to_be_too_ill_to_face_trial.xmi" "b/wikinews-importer/samples/Ratko_Mladi\304\207_said_to_be_too_ill_to_face_trial.xmi" deleted file mode 100644 index cfa560b..0000000 --- "a/wikinews-importer/samples/Ratko_Mladi\304\207_said_to_be_too_ill_to_face_trial.xmi" +++ /dev/null @@ -1,28 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<xmi:XMI xmlns:cas="http:///uima/cas.ecore" xmlns:annotations="http:///org/apache/opennlp/annotations.ecore" xmlns:xmi="http://www.omg.org/XMI" xmlns:tcas="http:///uima/tcas.ecore" xmi:version="2.0"> - <cas:NULL xmi:id="0"/> - <cas:Sofa xmi:id="1" sofaNum="1" sofaID="_InitialView" mimeType="text" sofaString="Ratko Mladić said to be too ill to face trial Ratko Mladić, a former Bosnian Serb general, is allegedly too ill to face trial for war crimes. According to his lawyer, 69-year-old Mladić will not survive to see the start of proceedings. Concerns for the health of Mladić come despite a Serbian judge having ruled him fit for extradition to the UN war crimes tribunal in The Ha [...] - <tcas:DocumentAnnotation xmi:id="8" sofa="1" begin="0" end="1618" language="x-unspecified"/> - <annotations:Headline xmi:id="13" sofa="1" begin="0" end="45"/> - <annotations:Paragraph xmi:id="17" sofa="1" begin="47" end="236"/> - <annotations:Paragraph xmi:id="21" sofa="1" begin="238" end="441"/> - <annotations:Paragraph xmi:id="25" sofa="1" begin="443" end="593"/> - <annotations:Paragraph xmi:id="29" sofa="1" begin="595" end="722"/> - <annotations:Paragraph xmi:id="33" sofa="1" begin="724" end="906"/> - <annotations:Paragraph xmi:id="37" sofa="1" begin="908" end="1173"/> - <annotations:Paragraph xmi:id="45" sofa="1" begin="1483" end="1617"/> - <annotations:Paragraph xmi:id="101" sofa="1" begin="1175" end="1481"/> - <annotations:Sentence xmi:id="49" sofa="1" begin="143" end="236"/> - <annotations:Sentence xmi:id="53" sofa="1" begin="47" end="142"/> - <annotations:Sentence xmi:id="57" sofa="1" begin="238" end="441"/> - <annotations:Sentence xmi:id="61" sofa="1" begin="443" end="593"/> - <annotations:Sentence xmi:id="65" sofa="1" begin="595" end="722"/> - <annotations:Sentence xmi:id="69" sofa="1" begin="850" end="906"/> - <annotations:Sentence xmi:id="73" sofa="1" begin="724" end="849"/> - <annotations:Sentence xmi:id="77" sofa="1" begin="908" end="1173"/> - <annotations:Sentence xmi:id="81" sofa="1" begin="1395" end="1481"/> - <annotations:Sentence xmi:id="85" sofa="1" begin="1277" end="1394"/> - <annotations:Sentence xmi:id="89" sofa="1" begin="1175" end="1276"/> - <annotations:Sentence xmi:id="93" sofa="1" begin="1483" end="1617"/> - <cas:View sofa="1" members="8 13 17 21 25 29 33 37 45 101 49 53 57 61 65 69 73 77 81 85 89 93"/> -</xmi:XMI> diff --git a/wikinews-importer/samples/TypeSystem.xml b/wikinews-importer/samples/TypeSystem.xml deleted file mode 100644 index 75180a4..0000000 --- a/wikinews-importer/samples/TypeSystem.xml +++ /dev/null @@ -1,106 +0,0 @@ -<?xml version="1.0" encoding="UTF-8" ?> - - <!-- - *************************************************************** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - *************************************************************** - --> - -<typeSystemDescription xmlns="http://uima.apache.org/resourceSpecifier"> - <name>WikinewsTypeSystem</name> - <description>Wikinews Sample Type System Definition</description> - <vendor>The Apache Software Foundation</vendor> - <version>1.0</version> - <types> - <typeDescription> - <name>org.apache.opennlp.annotations.Headline</name> - <description></description> - <supertypeName>uima.tcas.Annotation</supertypeName> - </typeDescription> - - <typeDescription> - <name>org.apache.opennlp.annotations.SubHeadline</name> - <description></description> - <supertypeName>uima.tcas.Annotation</supertypeName> - </typeDescription> - - <typeDescription> - <name>org.apache.opennlp.annotations.Paragraph</name> - <description></description> - <supertypeName>uima.tcas.Annotation</supertypeName> - </typeDescription> - - <typeDescription> - <name>org.apache.opennlp.annotations.Sentence</name> - <description></description> - <supertypeName>uima.tcas.Annotation</supertypeName> - </typeDescription> - - <typeDescription> - <name>org.apache.opennlp.annotations.Token</name> - <description></description> - <supertypeName>uima.tcas.Annotation</supertypeName> - </typeDescription> - - <typeDescription> - <name>org.apache.opennlp.annotations.Person</name> - <description></description> - <supertypeName>uima.tcas.Annotation</supertypeName> - </typeDescription> - - <typeDescription> - <name>org.apache.opennlp.annotations.Organization</name> - <description></description> - <supertypeName>uima.tcas.Annotation</supertypeName> - </typeDescription> - - <typeDescription> - <name>org.apache.opennlp.annotations.WikiLink</name> - <supertypeName>uima.tcas.Annotation</supertypeName> - <features> - <featureDescription> - <name>link</name> - <description></description> - <rangeTypeName>uima.cas.String</rangeTypeName> - </featureDescription> - </features> - </typeDescription> - <typeDescription> - <name>bumblebee.annotations.AnnotationStatus</name> - <supertypeName>uima.tcas.Annotation</supertypeName> - <features> - <featureDescription> - <name>sentence</name> - <rangeTypeName>uima.cas.Boolean</rangeTypeName> - </featureDescription> - <featureDescription> - <name>token</name> - <rangeTypeName>uima.cas.Boolean</rangeTypeName> - </featureDescription> - <featureDescription> - <name>person</name> - <rangeTypeName>uima.cas.Boolean</rangeTypeName> - </featureDescription> - <featureDescription> - <name>organization</name> - <rangeTypeName>uima.cas.Boolean</rangeTypeName> - </featureDescription> - </features> - </typeDescription> - </types> -</typeSystemDescription> \ No newline at end of file diff --git a/wikinews-importer/samples/US_actor_Peter_Falk_dies_aged_83.xmi b/wikinews-importer/samples/US_actor_Peter_Falk_dies_aged_83.xmi deleted file mode 100644 index 4177b48..0000000 --- a/wikinews-importer/samples/US_actor_Peter_Falk_dies_aged_83.xmi +++ /dev/null @@ -1,22 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<xmi:XMI xmlns:cas="http:///uima/cas.ecore" xmlns:annotations="http:///org/apache/opennlp/annotations.ecore" xmlns:xmi="http://www.omg.org/XMI" xmlns:tcas="http:///uima/tcas.ecore" xmi:version="2.0"> - <cas:NULL xmi:id="0"/> - <cas:Sofa xmi:id="1" sofaNum="1" sofaID="_InitialView" mimeType="text" sofaString="US actor Peter Falk dies aged 83 US actor Peter Falk died on Thursday at his home in Beverly Hills, California after a struggle with Alzheimer's disease. Falk may be most known for his role as Detective Columbo in the television series of the same name that ran on NBC from 1968 to 1978, which moved to ABC in 1989. The last episode aired in 2003. His portrayal of the character w [...] - <tcas:DocumentAnnotation xmi:id="8" sofa="1" begin="0" end="902" language="x-unspecified"/> - <annotations:Headline xmi:id="13" sofa="1" begin="0" end="32"/> - <annotations:Paragraph xmi:id="17" sofa="1" begin="34" end="405"/> - <annotations:Paragraph xmi:id="21" sofa="1" begin="407" end="636"/> - <annotations:Paragraph xmi:id="25" sofa="1" begin="638" end="852"/> - <annotations:Paragraph xmi:id="29" sofa="1" begin="854" end="901"/> - <annotations:Sentence xmi:id="33" sofa="1" begin="554" end="636"/> - <annotations:Sentence xmi:id="37" sofa="1" begin="34" end="154"/> - <annotations:Sentence xmi:id="41" sofa="1" begin="155" end="316"/> - <annotations:Sentence xmi:id="45" sofa="1" begin="317" end="348"/> - <annotations:Sentence xmi:id="49" sofa="1" begin="349" end="405"/> - <annotations:Sentence xmi:id="53" sofa="1" begin="407" end="511"/> - <annotations:Sentence xmi:id="57" sofa="1" begin="512" end="553"/> - <annotations:Sentence xmi:id="61" sofa="1" begin="638" end="746"/> - <annotations:Sentence xmi:id="65" sofa="1" begin="747" end="852"/> - <annotations:Sentence xmi:id="69" sofa="1" begin="854" end="901"/> - <cas:View sofa="1" members="8 13 17 21 25 29 33 37 41 45 49 53 57 61 65 69"/> -</xmi:XMI> diff --git a/wikinews-importer/samples/wikinews.xml b/wikinews-importer/samples/wikinews.xml deleted file mode 100644 index eb98409..0000000 --- a/wikinews-importer/samples/wikinews.xml +++ /dev/null @@ -1,107 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> - -<!-- - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. ---> - -<fields xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" - xsi:noNamespaceSchemaLocation="../../main/resources/lucas.xsd"> - - <field name="headline" index="yes" stored="no"> - <annotations> - <annotation type="org.apache.opennlp.annotations.Headline" tokenizer="standard"> - <filters> - <filter name="lowercase"/> - </filters> - </annotation> - </annotations> - </field> - - <field name="text" index="yes" stored="no"> - <annotations> - <annotation type="uima.tcas.DocumentAnnotation" tokenizer="standard"> - <filters> - <filter name="lowercase"/> - </filters> - </annotation> - - </annotations> - </field> - - <field name="person" index="yes" stored="no"> - <annotations> - <annotation type="org.apache.opennlp.annotations.Person" tokenizer="standard"> - <filters> - <filter name="lowercase"/> - </filters> - </annotation> - - </annotations> - </field> - - <field name="organization" index="yes" stored="no"> - <annotations> - <annotation type="org.apache.opennlp.annotations.Organization" tokenizer="standard"> - <filters> - <filter name="lowercase"/> - </filters> - </annotation> - - </annotations> - </field> - - <field name="status-sentence" index="yes" stored="no" termVector="no"> - <annotations> - <annotation type="bumblebee.annotations.AnnotationStatus" tokenizer="standard"> - <features> - <feature name="sentence"/> - </features> - </annotation> - </annotations> - </field> - - <field name="status-token" index="yes" stored="no" termVector="no"> - <annotations> - <annotation type="bumblebee.annotations.AnnotationStatus"> - <features> - <feature name="token"/> - </features> - </annotation> - </annotations> - </field> - - <field name="status-person" index="yes" stored="yes"> - <annotations> - <annotation type="bumblebee.annotations.AnnotationStatus"> - <features> - <feature name="token"/> - </features> - </annotation> - </annotations> - </field> - - <field name="status-organization" index="yes"> - <annotations> - <annotation type="bumblebee.annotations.AnnotationStatus"> - <features> - <feature name="token"/> - </features> - </annotation> - </annotations> - </field> -</fields> diff --git a/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/AnnotatingMarkupParser.java b/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/AnnotatingMarkupParser.java deleted file mode 100644 index 2624ae7..0000000 --- a/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/AnnotatingMarkupParser.java +++ /dev/null @@ -1,262 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.opennlp.wikinews_importer; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import info.bliki.htmlcleaner.ContentToken; -import info.bliki.htmlcleaner.TagNode; -import info.bliki.wiki.filter.ITextConverter; -import info.bliki.wiki.filter.WPList; -import info.bliki.wiki.filter.WPTable; -import info.bliki.wiki.model.Configuration; -import info.bliki.wiki.model.IWikiModel; -import info.bliki.wiki.model.ImageFormat; -import info.bliki.wiki.model.WikiModel; -import info.bliki.wiki.tags.WPATag; - -/** - * Parse mediawiki markup to strip the formatting info and extract a simple text - * version suitable for NLP along with header, paragraph and link position - * annotations. - * <p> - * Use the {@code #convert(String)} and {@code #getWikiLinks()} methods. - * <p> - * Due to the constraints imposed by the {@link ITextConverter} / - * {@link WikiModel} API, this class is not thread safe: only one instance - * should be run by thread. - */ -public class AnnotatingMarkupParser implements ITextConverter { - - private static final String HREF_ATTR_KEY = "href"; - - private static final String WIKILINK_TITLE_ATTR_KEY = "title"; - private static final String WIKILINK_TARGET_ATTR_KEY = "href"; - private static final String WIKIOBJECT_ATTR_KEY = "wikiobject"; - - private static final Set<String> PARAGRAPH_TAGS = Set.of("p"); - private static final Set<String> HEADING_TAGS = Set.of("h1", "h2", "h3", "h4", "h5", "h6"); - - private final List<Annotation> wikilinks = new ArrayList<>(); - private final List<Annotation> headers = new ArrayList<>(); - private final List<Annotation> paragraphs = new ArrayList<>(); - - private String languageCode = "en"; - - private final WikiModel model; - - private String redirect; - - private String text; - - public AnnotatingMarkupParser() { - model = makeWikiModel(languageCode); - } - - public AnnotatingMarkupParser(String languageCode) { - this.languageCode = languageCode; - model = makeWikiModel(languageCode); - } - - public WikiModel makeWikiModel(String langCode) { - return new WikiModel(String.format("https:/%s.wikipedia.org/wiki/${image}", langCode), - String.format("https://%s.wikipedia.org/wiki/${title}", langCode)) { - @Override - public String getRawWikiContent(String namespace, - String articleName, Map<String, String> templateParameters) { - // disable template support - // TODO: we need to re-add template support at least for dates - return ""; - } - }; - } - - @Override - public void nodesToText(List<?> nodes, Appendable buffer, IWikiModel model) throws IOException { - CountingAppendable countingBuffer; - if (buffer instanceof CountingAppendable) { - countingBuffer = (CountingAppendable) buffer; - } else { - // wrap - countingBuffer = new CountingAppendable(buffer); - } - - if (nodes != null && !nodes.isEmpty()) { - try { - int level = model.incrementRecursionLevel(); - if (level > Configuration.RENDERER_RECURSION_LIMIT) { - countingBuffer.append("Error - recursion limit exceeded" - + " rendering tags in PlainTextConverter#nodesToText()."); - return; - } - for (Object node : nodes) { - if (node instanceof WPATag tag) { - // extract wikilink annotations - String wikilinkLabel = tag.getAttributes().get(WIKILINK_TITLE_ATTR_KEY); - String wikilinkTarget = tag.getAttributes().get(WIKILINK_TARGET_ATTR_KEY); - if (wikilinkLabel != null) { - int colonIdx = -1; // wikilinkLabel.indexOf(':'); - if (colonIdx == -1) { - // do not serialize non-topic wiki-links such as - // translation links missing from the - // INTERWIKI_LINK map - int start = countingBuffer.currentPosition; - tag.getBodyString(countingBuffer); - int end = countingBuffer.currentPosition; - if (!wikilinkTarget.startsWith("#")) { - // TODO: wikilink label is not important,since that is the covered text? - wikilinks.add(new Annotation(start, end, wikilinkLabel, wikilinkTarget)); - } - } - } else { - tag.getBodyString(countingBuffer); - } - - } else if (node instanceof ContentToken contentToken) { - countingBuffer.append(contentToken.getContent()); - } else if (node instanceof List) { - } else if (node instanceof WPList) { - } else if (node instanceof WPTable) { - // ignore lists and tables since they most of the time - // do not hold grammatically correct - // interesting sentences that are representative of the - // language. - } else if (node instanceof TagNode tagNode) { - Map<String, String> attributes = tagNode.getAttributes(); - Map<String, Object> oAttributes = tagNode.getObjectAttributes(); - boolean hasSpecialHandling = false; - String tagName = tagNode.getName(); - int tagBegin = countingBuffer.currentPosition; - - if ("ref".equals(tagName)) { - // ignore the references since they do not hold - // interesting text content - hasSpecialHandling = true; - } else if (oAttributes != null - && oAttributes.get(WIKIOBJECT_ATTR_KEY) instanceof ImageFormat) { - // the caption of images often holds well-formed - // sentences with links to entities - hasSpecialHandling = true; - ImageFormat iformat = (ImageFormat) oAttributes.get(WIKIOBJECT_ATTR_KEY); - imageNodeToText(tagNode, iformat, countingBuffer, model); - } - if (!hasSpecialHandling) { - nodesToText(tagNode.getChildren(), countingBuffer, model); - } - if (PARAGRAPH_TAGS.contains(tagName)) { - paragraphs.add(new Annotation(tagBegin, - countingBuffer.currentPosition, "paragraph", tagName)); - countingBuffer.append("\n\n"); - } else if (HEADING_TAGS.contains(tagName)) { - headers.add(new Annotation(tagBegin, - countingBuffer.currentPosition, "heading", tagName)); - countingBuffer.append("\n\n"); - } else if ("a".equals(tagName)) { - String href = attributes.get(HREF_ATTR_KEY); - - // TODO: How to get covered text here? Is not needed anyway right?! - wikilinks.add(new Annotation(tagBegin, countingBuffer.currentPosition, - "", href)); - } - - } - } - } finally { - model.decrementRecursionLevel(); - } - } - } - - @Override - public void imageNodeToText(TagNode tagNode, ImageFormat imageFormat, - Appendable buffer, IWikiModel model) throws IOException { -// nodesToText(tagNode.getChildren(), buffer, model); - } - - @Override - public boolean noLinks() { - return true; - } - - public List<Annotation> getWikiLinkAnnotations() { - return wikilinks; - } - - public List<Annotation> getHeaderAnnotations() { - return headers; - } - - public List<Annotation> getParagraphAnnotations() { - return paragraphs; - } - - public List<String> getParagraphs() { - List<String> texts = new ArrayList<>(); - for (Annotation p : paragraphs) { - texts.add(text.substring(p.begin, p.end)); - } - return texts; - } - - public List<String> getHeaders() { - List<String> texts = new ArrayList<>(); - for (Annotation h : headers) { - texts.add(text.substring(h.begin, h.end)); - } - return texts; - } - - public String getRedirect() { - return redirect; - } - - public static class CountingAppendable implements Appendable { - - public int currentPosition = 0; - - final protected Appendable wrappedBuffer; - - public CountingAppendable(Appendable wrappedBuffer) { - this.wrappedBuffer = wrappedBuffer; - } - - @Override - public Appendable append(CharSequence charSeq) throws IOException { - currentPosition += charSeq.length(); - return wrappedBuffer.append(charSeq); - } - - @Override - public Appendable append(char aChar) throws IOException { - currentPosition += 1; - return wrappedBuffer.append(aChar); - } - - @Override - public Appendable append(CharSequence charSeq, int start, int end) throws IOException { - currentPosition += end - start; - return wrappedBuffer.append(charSeq, start, end); - } - - } - -} diff --git a/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/Annotation.java b/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/Annotation.java deleted file mode 100644 index 7be95fa..0000000 --- a/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/Annotation.java +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.opennlp.wikinews_importer; - -public class Annotation { - - public final int begin; - - public final int end; - - public final String label; - - public final String value; - - public Annotation(int start, int end, String label, String value) { - this.begin = start; - this.end = end; - this.label = label; - this.value = value; - } - -} diff --git a/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/UimaUtil.java b/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/UimaUtil.java deleted file mode 100644 index 3039e10..0000000 --- a/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/UimaUtil.java +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.opennlp.wikinews_importer; - -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; - -import javax.xml.parsers.ParserConfigurationException; -import javax.xml.parsers.SAXParser; -import javax.xml.parsers.SAXParserFactory; - -import org.apache.uima.ResourceSpecifierFactory; -import org.apache.uima.UIMAFramework; -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.impl.XmiCasDeserializer; -import org.apache.uima.cas.impl.XmiCasSerializer; -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.resource.metadata.FsIndexDescription; -import org.apache.uima.resource.metadata.TypePriorities; -import org.apache.uima.resource.metadata.TypeSystemDescription; -import org.apache.uima.resource.metadata.impl.FsIndexDescription_impl; -import org.apache.uima.util.CasCreationUtils; -import org.apache.uima.util.InvalidXMLException; -import org.apache.uima.util.XMLInputSource; -import org.apache.uima.util.XMLParser; -import org.apache.uima.util.XMLSerializer; -import org.xml.sax.SAXException; - -public class UimaUtil { - - static TypeSystemDescription createTypeSystemDescription(InputStream in) { - - // Note: - // Type System location is not set correctly, - // resolving a referenced type system will fail - - XMLInputSource xmlTypeSystemSource = new XMLInputSource(in, new File("")); - - XMLParser xmlParser = UIMAFramework.getXMLParser(); - - TypeSystemDescription typeSystemDescriptor; - - try { - typeSystemDescriptor = (TypeSystemDescription) xmlParser.parse(xmlTypeSystemSource); - - typeSystemDescriptor.resolveImports(); - } catch (InvalidXMLException e) { - e.printStackTrace(); - typeSystemDescriptor = null; - } - - return typeSystemDescriptor; - } - - static CAS createEmptyCAS(TypeSystemDescription typeSystem) { - ResourceSpecifierFactory resourceSpecifierFactory = UIMAFramework - .getResourceSpecifierFactory(); - TypePriorities typePriorities = resourceSpecifierFactory - .createTypePriorities(); - - FsIndexDescription indexDesciptor = new FsIndexDescription_impl(); - indexDesciptor.setLabel("TOPIndex"); - indexDesciptor.setTypeName("uima.cas.TOP"); - indexDesciptor.setKind(FsIndexDescription.KIND_SORTED); - - CAS cas; - try { - cas = CasCreationUtils.createCas(typeSystem, typePriorities, - new FsIndexDescription[] { indexDesciptor }); - } catch (ResourceInitializationException e) { - e.printStackTrace(); - cas = null; - } - - return cas; - } - - static void deserializeXmiCAS(CAS cas, InputStream xmiIn) throws IOException { - - SAXParserFactory saxParserFactory = SAXParserFactory.newInstance(); - saxParserFactory.setValidating(false); - - SAXParser saxParser; - - try { - saxParser = saxParserFactory.newSAXParser(); - } catch (ParserConfigurationException e) { - throw new IllegalStateException( - "SAXParser should be configured correctly!", e); - } catch (SAXException e) { - throw new IllegalStateException("SAX error while creating parser!", e); - } - - XmiCasDeserializer deserializer = new XmiCasDeserializer(cas.getTypeSystem()); - - try { - saxParser.parse(xmiIn, deserializer.getXmiCasHandler(cas)); - } catch (SAXException e) { - throw new IOException("Invalid XMI input!", e); - } - } - - static void serializeCASToXmi(CAS cas, OutputStream out) throws IOException { - XmiCasSerializer xmiSerializer = new XmiCasSerializer(cas.getTypeSystem()); - - XMLSerializer xmlSerialzer = new XMLSerializer(out, true); - - try { - xmiSerializer.serialize(cas, xmlSerialzer.getContentHandler()); - } catch (SAXException e) { - e.printStackTrace(); - } - } -} diff --git a/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsConverter.java b/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsConverter.java deleted file mode 100644 index 9c03e74..0000000 --- a/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsConverter.java +++ /dev/null @@ -1,196 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.opennlp.wikinews_importer; - -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.OutputStream; -import java.net.URLEncoder; -import java.nio.charset.StandardCharsets; -import java.util.ArrayList; -import java.util.List; - -import info.bliki.wiki.dump.IArticleFilter; -import info.bliki.wiki.dump.Siteinfo; -import info.bliki.wiki.dump.WikiArticle; -import info.bliki.wiki.dump.WikiXMLParser; - -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.Feature; -import org.apache.uima.cas.Type; -import org.apache.uima.cas.text.AnnotationFS; -import org.apache.uima.resource.metadata.TypeSystemDescription; -import org.xml.sax.SAXException; - -/** - * Demo application which reads an uncompressed Wikipedia XML dump - * file and writes each article as an XMI file. - */ -public class WikinewsConverter { - - static class CASArticleFilter implements IArticleFilter { - - private final TypeSystemDescription tsDesc; - private final File outputFolder; - private final List<String> endOfArticleMarkers = new ArrayList<>(); - - CASArticleFilter(TypeSystemDescription tsDesc, File outputFolder) { - - this.tsDesc = tsDesc; - this.outputFolder = outputFolder; - - endOfArticleMarkers.add("{{haveyoursay}}"); - endOfArticleMarkers.add("== Sources =="); - endOfArticleMarkers.add("==Sources=="); - endOfArticleMarkers.add("== Source =="); - endOfArticleMarkers.add("==Source=="); - endOfArticleMarkers.add("==References=="); - endOfArticleMarkers.add("== References =="); - endOfArticleMarkers.add("=== References==="); - } - - public static String titleToUri(String title) { - return URLEncoder.encode(title.replaceAll(" ", "_"), StandardCharsets.UTF_8); - } - - @Override - public void process(WikiArticle page, Siteinfo siteinfo) - throws SAXException { - - if (page.getIntegerNamespace() == 0 && page.isMain()) { - if (page.getText().toLowerCase().contains("{publish}")) { - - String pageText = page.getText(); - int cutIndex = pageText.length(); - - for (String endMarker : endOfArticleMarkers) { - int endMarkerIndex = pageText.indexOf(endMarker); - if (endMarkerIndex != -1 && endMarkerIndex < cutIndex) { - cutIndex = endMarkerIndex; - } - } - - if (cutIndex < pageText.length()) { - pageText = pageText.substring(0, cutIndex); - } - - WikinewsWikiModel wikiModel = new WikinewsWikiModel( - "https://en.wikinews.org/wiki/${image}", - "https://en.wikinews.org/wiki/${title}"); - - AnnotatingMarkupParser converter = new AnnotatingMarkupParser(); - String plainStr = wikiModel.render(converter, pageText); - - CAS articleCAS = UimaUtil.createEmptyCAS(tsDesc); - - // TODO: find a way to nicely add title .. - StringBuilder articleText = new StringBuilder(); - articleText.append(page.getTitle()); - - int endOffsetTitle = articleText.length(); - - articleText.append("\n"); - articleText.append("\n"); - - int bodyOffset = articleText.length(); - - articleText.append(plainStr); // Note: Add offset to annotations ... by this - - articleCAS.setDocumentLanguage("en"); - articleCAS.setDocumentText(articleText.toString()); - - AnnotationFS headlineAnnotation = articleCAS.createAnnotation(articleCAS.getTypeSystem() - .getType("org.apache.opennlp.annotations.Headline"), - 0, endOffsetTitle); - - articleCAS.addFsToIndexes(headlineAnnotation); - - for (Annotation paraAnn : converter.getParagraphAnnotations()) { - AnnotationFS paraAnnFS = articleCAS.createAnnotation(articleCAS.getTypeSystem() - .getType("org.apache.opennlp.annotations.Paragraph"), - bodyOffset + paraAnn.begin, bodyOffset + paraAnn.end); - - articleCAS.addFsToIndexes(paraAnnFS); - } - - for (Annotation subHeadAnn : converter.getHeaderAnnotations()) { - AnnotationFS subHeadAnnFS = articleCAS.createAnnotation(articleCAS.getTypeSystem() - .getType("org.apache.opennlp.annotations.SubHeadline"), - bodyOffset + subHeadAnn.begin, bodyOffset + subHeadAnn.end); - - articleCAS.addFsToIndexes(subHeadAnnFS); - } - - Type wikiLinkType = articleCAS.getTypeSystem() - .getType("org.apache.opennlp.annotations.WikiLink"); - Feature linkFeature = wikiLinkType.getFeatureByBaseName("link"); - - for (Annotation wikiLinkAnn : converter.getWikiLinkAnnotations()) { - AnnotationFS wikiLinkAnnFS = articleCAS.createAnnotation(articleCAS.getTypeSystem() - .getType("org.apache.opennlp.annotations.WikiLink"), - bodyOffset + wikiLinkAnn.begin, bodyOffset + wikiLinkAnn.end); - - wikiLinkAnnFS.setStringValue(linkFeature, wikiLinkAnn.value); - - articleCAS.addFsToIndexes(wikiLinkAnnFS); - } - - CAS markupCas = articleCAS.createView("WikiMarkup"); - markupCas.setDocumentText(page.toString()); - - // now serialize CAS - try (OutputStream casOut = new FileOutputStream(outputFolder.getAbsolutePath() + - File.separator + titleToUri(page.getTitle()) + ".xmi")) { - - UimaUtil.serializeCASToXmi(articleCAS, casOut); - } - catch (IOException e) { - e.printStackTrace(); - } - } - } - } - } - - public static void main(String[] args) throws Exception { - if (args.length != 2) { - System.err.println("Usage: Parser <XML-File> <Output-Folder>"); - System.exit(-1); - } - - // TODO: Should to be configurable! - TypeSystemDescription tsDesc = UimaUtil.createTypeSystemDescription( - new FileInputStream("samples/TypeSystem.xml")); - - File outputFolder = new File(args[1]); - outputFolder.mkdirs(); - - String bz2Filename = args[0]; - try { - IArticleFilter handler = new CASArticleFilter(tsDesc, new File(args[1])); - WikiXMLParser wxp = new WikiXMLParser(bz2Filename, handler); - wxp.parse(); - } catch (Exception e) { - System.out.println("Parsing the corpus failed:"); - System.out.println(); - e.printStackTrace(); - } - } -} diff --git a/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsWikiModel.java b/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsWikiModel.java deleted file mode 100644 index df1592f..0000000 --- a/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/WikinewsWikiModel.java +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.opennlp.wikinews_importer; - -import java.util.Map; - -import info.bliki.wiki.model.WikiModel; - -public class WikinewsWikiModel extends WikiModel { - - public WikinewsWikiModel(String imageBaseURL, String linkBaseURL) { - super(imageBaseURL, linkBaseURL); - } - - @Override - public String getRawWikiContent(String namespace, String articleName, - Map<String, String> map) { - - String result = super.getRawWikiContent(namespace, articleName, map); - if (result == null) { - - // Maybe use special handling for date, ... ?! - - if (articleName.equals("w")) - return map.get("1"); - - return ""; - } - else { - return result; - } - } -}
