# Forked from agaelebe/Reuters21578_sgml_to_txt.rb
# (gist m3mike/004c38c4eebdf8ca6563, last active August 29, 2015)
# =Split Reuters-21578
# =(Found at: http://www.daviddlewis.com/resources/testcollections/reuters21578/)
# =SGML files into separate TXT files
#
# Documents selected are those from LEWIS SPLIT that have at least one topic.
# Documents (only the body of text) are put in directories according to their type (train/test) and topic.
# Documents with more than one topic are written in more than one folder.
# Only documents that have a topic listed in 'used_topics' Array are selected.
# This pre-processing is useful for text categorization applications.
#
# Author: Hugo.Borges _at_ gmail
#
# Release date: 2008 09 02
require 'fileutils'
require 'libxml'
# NOTE(review): iconv is no longer bundled with Ruby since 2.0; on newer
# Rubies this require needs the `iconv` gem -- confirm the target Ruby version.
require 'iconv'

reuters_dir = "reuters21578"    # location of the sgml files to process
write_dir   = "reuters21578txt" # write dir
main_topics = %w{commodities metals financial energy} # name of txt files containing the topics that will be used

# Reads "<dir>/<name>.txt" for every entry of +names+ (one topic per line) and
# returns all topics found as a single sorted Array of Strings.
# Blank lines are ignored. Raises Errno::ENOENT if a topic file is missing.
def load_used_topics(dir, names)
  per_file = names.collect do |name|
    File.readlines(File.join(dir, "#{name}.txt")).collect { |line| line.strip }
  end
  # Non-bang flatten/sort: flatten! returns nil when it changes nothing.
  per_file.flatten.reject { |topic| topic.empty? }.sort
end

# Maps a LEWISSPLIT attribute value to the output sub-directory name.
# Returns nil for any other value (e.g. "NOT-USED"), meaning the document
# belongs to neither split and must not be exported.
def split_dir_for(lewis_split)
  case lewis_split
  when "TRAIN" then "train"
  when "TEST"  then "test"
  end
end

# Parses one <REUTERS>...</REUTERS> document (+xml+, a String) and writes the
# content of its TEXT element to
# "<write_dir>/<train|test>/<topic>/reut21578_<NEWID>.txt" once for every
# topic of the document that is listed in +used_topics+.
# Documents whose TOPICS attribute is not "YES", or whose LEWISSPLIT is
# neither TRAIN nor TEST, are skipped.
def export_document(xml, write_dir, used_topics)
  doc  = LibXML::XML::Parser.string(xml).parse
  root = doc.find('/REUTERS')[0]
  return unless root.attributes.get_attribute("TOPICS").value == "YES"

  doc_type = split_dir_for(root.attributes.get_attribute("LEWISSPLIT").value)
  return if doc_type.nil?

  doc_id = root.attributes.get_attribute("NEWID").value
  text   = nil # looked up lazily, once per document
  doc.find('/REUTERS/TOPICS/D').each do |topic|
    klass = topic.content
    next unless used_topics.include?(klass)

    text ||= doc.find('/REUTERS/TEXT')[0].content
    topic_dir = File.join(write_dir, doc_type, klass)
    FileUtils.mkdir_p(topic_dir)
    File.open(File.join(topic_dir, "reut21578_#{doc_id}.txt"), "w") do |file|
      file.write(text)
    end
  end
end

# used_topics is an Array with the topics that we want to use
used_topics = load_used_topics(reuters_dir, main_topics)

%w{test train}.each { |split| FileUtils.mkdir_p(File.join(write_dir, split)) }

# Only the SGML data files are processed; the topic .txt files living in the
# same directory (and "."/"..") are left out explicitly instead of relying on
# the order in which Dir.entries happens to list them.
sgml_files = Dir.entries(reuters_dir).select { |name| name =~ /\.sgml?\z/i }.sort

sgml_files.each do |filename|
  # Binary mode + block form: the raw bytes are handed to iconv untouched and
  # the file handle is closed as soon as the lines are read.
  file_lines = File.open(File.join(reuters_dir, filename), "rb") { |f| f.readlines }

  document_lines = []
  file_lines.each do |line|
    line.delete!("#&")                 # remove characters that confuse libxml parser
    line.sub!(/<!DOCTYPE.*?>/, "")     # remove doctype (not used)
    # Iconv.iconv returns an Array of Strings; join it (Array#to_s would
    # yield the inspect form on Ruby >= 1.9).
    document_lines << Iconv.iconv('utf-8', 'cp1251', line).join # convert to utf-8

    next unless line.include?("</REUTERS>") # end of one document

    export_document(document_lines.join, write_dir, used_topics)
    document_lines = []
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment