Skip to content

Instantly share code, notes, and snippets.

@jacopofar
Created May 14, 2013 12:26
Show Gist options
  • Save jacopofar/5575520 to your computer and use it in GitHub Desktop.
Save jacopofar/5575520 to your computer and use it in GitHub Desktop.
Extract Italian words POS tags froma Wiktionary XML dump
package generazione;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.ParserConfigurationException;
import org.xml.sax.SAXException;
public class GeneraDatabasePOS {
private static BufferedWriter wr;
/**
* Small tool to extract a list of word roles from a dump of en.wiktionary
* It's made for Italian but should work for other languages with mimimal changes.
* Example output:
*
* It reads the XML file line per line and parse with regular expressions, without using SAX
* @throws SAXException
* @throws ParserConfigurationException
* @throws IOException
*/
public static void main(String[] args) throws IOException {
String strLine;
String title="";
String content="";
boolean insideArticle=false;
File file = new File("dizionario.txt");
DataOutputStream out = new DataOutputStream(new FileOutputStream(file));
wr=new BufferedWriter(new OutputStreamWriter(out));
//parameters to inform the user about the work status
long start=System.currentTimeMillis();
long lastMessage=start;
int saw=0;
//open the en.wiktionary dump and read it line per line
FileInputStream fstream = new FileInputStream("/home/hari/Documents/itdict/enwiktionary-20130213-pages-articles.xml");
DataInputStream in = new DataInputStream(fstream);
BufferedReader br = new BufferedReader(new InputStreamReader(in));
while ((strLine = br.readLine()) != null){
if(lastMessage+5000<System.currentTimeMillis()){
System.out.println("I saw "+saw+" pages ("+Math.round(((double)saw/(double)(System.currentTimeMillis()-start))*1000)+" pages per second)");
lastMessage=System.currentTimeMillis();
}
if(strLine.contains("<title>")){
title=strLine.split("<title>")[1].split("</title>")[0];
}
if(strLine.contains("<text ")){
insideArticle=true;
content="";
try{content=strLine.split(">")[1];}catch(ArrayIndexOutOfBoundsException e){}
continue;
}
if(strLine.contains("</text>")){
insideArticle=false;
content+=strLine.split("<")[0];
elabora(content,title);
saw++;
continue;
}
if(insideArticle)content+=strLine;
}
wr.close();
out.close();
System.out.println("Finished! It took "+(System.currentTimeMillis()-start)/1000 +" seconds");
}
private static void elabora(String contenuto, String titolo) throws IOException {
if(!contenuto.contains("==Italian==")) return;
String voceIt=contenuto.split("==Italian==")[1];
voceIt=voceIt.split("[^=]==[^=]")[0];
Matcher m = Pattern.compile("===([a-zA-Z ]+)===").matcher(voceIt);
String add="";
while (m.find()) {
String maybe=m.group().toLowerCase().replace("=", "");
if(maybe.equals("adverb") && !add.contains(maybe))add+=","+maybe;
if(maybe.equals("adjective") && !add.contains(maybe))add+=","+maybe;
if(maybe.equals("noun") && !add.contains(maybe))add+=","+maybe;
if(maybe.equals("verb") && !add.contains(maybe))add+=","+maybe;
if(maybe.equals("proper noun") && !add.contains(maybe))add+=","+maybe;
if(maybe.equals("conjunction") && !add.contains(maybe))add+=","+maybe;
if(maybe.equals("verb form") && !add.contains(maybe))add+=","+"verb"; //don't care if it's a form. Moreover, is not always used
if(maybe.equals("pronoun") && !add.contains(maybe))add+=","+maybe;
if(maybe.equals("article") && !add.contains(maybe))add+=","+maybe;
if(maybe.equals("preposition") && !add.contains(maybe))add+=","+maybe;
if(maybe.equals("interjection") && !add.contains(maybe))add+=","+maybe;
}
if(add.length()>0){
wr.write(titolo+"\t");
add=add.substring(1);
//System.out.println(titolo+"-->"+add);
wr.write(add);
wr.write("\n");
}
else
System.out.println("--don't understand the word type (POS) of: "+titolo);
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment