m3mike · August 29, 2015 14:06
diff --git a/Reuters21578_sgml_to_txt.rb b/Reuters21578_sgml_to_txt.rb
 # =Split Reuters-21578 
 # =(Found at: http://www.daviddlewis.com/resources/testcollections/reuters21578/)
 # =SGML files into separate TXT files
 # 
 # Documents selected are those from LEWIS SPLIT that have at least one topic.
 # Documents (only the text content of their TEXT element) are put in directories according to their type (train/test) and topic.
 # Documents with more than one topic are written in more than one folder.
 # Only documents that have a topic listed in  'used_topics' Array are selected.
 # This pre-processing is useful for text categorization applications.
 #
 # Author: Hugo.Borges _at_ gmail
 #
 # Release date: 2008 09 02

 require 'libxml'
 require 'iconv'

 # Input and output locations, both relative to the current working directory.
 reuters_dir = "reuters21578" # location of the sgml files to process
 write_dir = "reuters21578txt" # write dir

 # Base names of the "<name>.txt" files inside reuters_dir that list the
 # topics that will be used (one topic per line).
 main_topics = %w{commodities metals financial energy}

 # Read every topic list; surrounding whitespace and line endings are stripped
 # so that a stray space or blank line never produces an unmatched topic.
 topic_lists = main_topics.collect do |topic|
   IO.readlines("#{reuters_dir}/#{topic}.txt", "\n").collect { |line| line.strip }
 end

 # used_topics is a sorted Array with the topics that we want to use.
 # The non-destructive flatten/sort are used on purpose: flatten! returns nil
 # when nothing was flattened (e.g. an empty topic list), so chaining sort!
 # onto it can raise NoMethodError.
 used_topics = topic_lists.flatten.reject { |topic| topic.empty? }.sort

 # Create the output tree if it is missing. File.directory? tests the literal
 # path, whereas Dir[] would interpret characters such as * ? [ ] { } as a glob.
 [write_dir, "#{write_dir}/test", "#{write_dir}/train"].each do |dir|
   Dir.mkdir(dir) unless File.directory?(dir)
 end


 # Walk every regular file in reuters_dir. Lines are buffered until a closing
 # </REUTERS> tag is seen; the buffered lines are then parsed as one XML
 # document, and its TEXT content is written once per selected topic into
 # "#{write_dir}/<train|test>/<topic>/reut21578_<NEWID>.txt".
 Dir.entries(reuters_dir).sort.each do |filename|
   path = "#{reuters_dir}/#{filename}"

   # Skip ".", ".." and sub-directories explicitly instead of assuming they
   # are the first two entries; there is no longer a cap on the file count.
   next unless File.file?(path)

   new_file_lines = []

   # File.readlines closes the file handle once the lines have been read.
   File.readlines(path).each do |line|
     line.delete!("#&") # remove characters that confuse libxml parser
     line.sub!(/\<\!DOCTYPE.*?\>/, "") # remove doctype (not used)

     # Iconv.iconv returns an Array of converted strings. join gives the plain
     # String on every Ruby version; Array#to_s returns the inspect form
     # (brackets, quotes, escapes) from Ruby 1.9 on.
     line = Iconv::iconv('utf-8', 'cp1251', line).join # convert to utf-8
     new_file_lines << line

     # Keep buffering until the current document is complete.
     next unless /<\/REUTERS>/ =~ line

     # join (not to_s) for the same reason as above.
     doc = LibXML::XML::Parser.string(new_file_lines.join).parse
     new_file_lines = []

     root = doc.find('/REUTERS')[0]
     next unless root.attributes.get_attribute("TOPICS").value == "YES"

     # Per the Reuters-21578 README, LEWISSPLIT is "TRAIN", "TEST" or
     # "NOT-USED". Documents outside the split are skipped rather than being
     # counted as test documents.
     doc_type = case root.attributes.get_attribute("LEWISSPLIT").value
                when "TRAIN" then "train"
                when "TEST" then "test"
                end
     next if doc_type.nil?

     doc_id = root.attributes.get_attribute("NEWID").value
     text = nil # extracted lazily, only when at least one topic is selected

     # A document with several selected topics is written once per topic.
     doc.find('/REUTERS/TOPICS/D').each do |topic|
       klass = topic.content
       next unless used_topics.include?(klass)

       text ||= doc.find('/REUTERS/TEXT')[0].content
       klass_dir = "#{write_dir}/#{doc_type}/#{klass}"
       Dir.mkdir(klass_dir) unless File.directory?(klass_dir)

       File.open("#{klass_dir}/reut21578_#{doc_id}.txt", "w") do |file|
         file.write(text)
       end
     end
   end
 end
	# =Split Reuters-21578
	# =(Found at: http://www.daviddlewis.com/resources/testcollections/reuters21578/)
	# =SGML files into separate TXT files
	#
	# Documents selected are those from LEWIS SPLIT that have at least one topic.
	# Documents (only the text content of their TEXT element) are put in directories according to their type (train/test) and topic.
	# Documents with more than one topic are written in more than one folder.
	# Only documents that have a topic listed in 'used_topics' Array are selected.
	# This pre-processing is useful for text categorization applications.
	#
	# Author: Hugo.Borges _at_ gmail
	#
	# Release date: 2008 09 02

	require 'libxml'
	require 'iconv'

	# Input and output locations, both relative to the current working directory.
	reuters_dir = "reuters21578" # location of the sgml files to process
	write_dir = "reuters21578txt" # write dir

	# Base names of the "<name>.txt" files inside reuters_dir that list the
	# topics that will be used (one topic per line).
	main_topics = %w{commodities metals financial energy}

	# Read every topic list; surrounding whitespace and line endings are stripped
	# so that a stray space or blank line never produces an unmatched topic.
	topic_lists = main_topics.collect do |topic|
	  IO.readlines("#{reuters_dir}/#{topic}.txt", "\n").collect { |line| line.strip }
	end

	# used_topics is a sorted Array with the topics that we want to use.
	# The non-destructive flatten/sort are used on purpose: flatten! returns nil
	# when nothing was flattened (e.g. an empty topic list), so chaining sort!
	# onto it can raise NoMethodError.
	used_topics = topic_lists.flatten.reject { |topic| topic.empty? }.sort

	# Create the output tree if it is missing. File.directory? tests the literal
	# path, whereas Dir[] would interpret characters such as * ? [ ] { } as a glob.
	[write_dir, "#{write_dir}/test", "#{write_dir}/train"].each do |dir|
	  Dir.mkdir(dir) unless File.directory?(dir)
	end


	# Walk every regular file in reuters_dir. Lines are buffered until a closing
	# </REUTERS> tag is seen; the buffered lines are then parsed as one XML
	# document, and its TEXT content is written once per selected topic into
	# "#{write_dir}/<train|test>/<topic>/reut21578_<NEWID>.txt".
	# (Block parameters use plain | delimiters; the escaped \| form is not Ruby.)
	Dir.entries(reuters_dir).sort.each do |filename|
	  path = "#{reuters_dir}/#{filename}"

	  # Skip ".", ".." and sub-directories explicitly instead of assuming they
	  # are the first two entries; there is no longer a cap on the file count.
	  next unless File.file?(path)

	  new_file_lines = []

	  # File.readlines closes the file handle once the lines have been read.
	  File.readlines(path).each do |line|
	    line.delete!("#&") # remove characters that confuse libxml parser
	    line.sub!(/\<\!DOCTYPE.*?\>/, "") # remove doctype (not used)

	    # Iconv.iconv returns an Array of converted strings. join gives the plain
	    # String on every Ruby version; Array#to_s returns the inspect form
	    # (brackets, quotes, escapes) from Ruby 1.9 on.
	    line = Iconv::iconv('utf-8', 'cp1251', line).join # convert to utf-8
	    new_file_lines << line

	    # Keep buffering until the current document is complete.
	    next unless /<\/REUTERS>/ =~ line

	    # join (not to_s) for the same reason as above.
	    doc = LibXML::XML::Parser.string(new_file_lines.join).parse
	    new_file_lines = []

	    root = doc.find('/REUTERS')[0]
	    next unless root.attributes.get_attribute("TOPICS").value == "YES"

	    # Per the Reuters-21578 README, LEWISSPLIT is "TRAIN", "TEST" or
	    # "NOT-USED". Documents outside the split are skipped rather than being
	    # counted as test documents.
	    doc_type = case root.attributes.get_attribute("LEWISSPLIT").value
	               when "TRAIN" then "train"
	               when "TEST" then "test"
	               end
	    next if doc_type.nil?

	    doc_id = root.attributes.get_attribute("NEWID").value
	    text = nil # extracted lazily, only when at least one topic is selected

	    # A document with several selected topics is written once per topic.
	    doc.find('/REUTERS/TOPICS/D').each do |topic|
	      klass = topic.content
	      next unless used_topics.include?(klass)

	      text ||= doc.find('/REUTERS/TEXT')[0].content
	      klass_dir = "#{write_dir}/#{doc_type}/#{klass}"
	      Dir.mkdir(klass_dir) unless File.directory?(klass_dir)

	      File.open("#{klass_dir}/reut21578_#{doc_id}.txt", "w") do |file|
	        file.write(text)
	      end
	    end
	  end
	end