zackster · August 29, 2016 16:48
diff --git a/process_8k.rb b/process_8k.rb
 require 'nokogiri'
 require 'pg'
 require 'json'
 require 'parallel'
 require 'timeout'

 # Counts how many times each element occurs in +array+.
 #
 # Returns a Hash mapping every distinct element to its number of
 # occurrences. The Hash has a default value of 0, so looking up an element
 # that never appeared yields 0 rather than nil.
 def score( array )
   counts = Hash.new(0)
   array.each { |item| counts[item] += 1 }
   counts
 end


 # Select the filings to process: rows of form_8k_ngrams whose n2_grams is
 # still NULL, filtered by random() < 0.1 so each run only picks up a random
 # subset (roughly a tenth) of the outstanding rows.
 #
 # PG.connect replaces the deprecated PGconn constant (removed in pg 1.0), and
 # the ensure guarantees the connection is closed even if the query raises.
 pg_client = PG.connect(dbname: ENV['postgres_db'], user: ENV['postgres_user'], password: ENV['postgres_user_pw'])
 begin
   results = pg_client.exec("select filename from form_8k_ngrams where n2_grams is null and random()<0.1").to_a
 ensure
   pg_client.close
 end

 # English stopwords removed from the text before n-grams are built. The list
 # is built once, outside the per-file block, and stored as a Hash so every
 # membership test is a constant-time lookup instead of an Array scan.
 stopwords = %w[
   i me my myself we our ours ourselves you your yours yourself yourselves
   he him his himself she her hers herself it its itself they them their
   theirs themselves what which who whom this that these those am is are was
   were be been being have has had having do does did doing a an the and but
   if or because as until while of at by for with about against between into
   through during before after above below to from up down in out on off over
   under again further then once here there when where why how all any both
   each few more most other some such no nor not only own same so than too
   very s t can will just don should now d ll m o re ve y ain aren couldn
   didn doesn hadn hasn haven isn ma mightn mustn needn shan shouldn wasn
   weren won wouldn
 ].each_with_object({}) { |word, lookup| lookup[word] = true }

 # For every selected row: locate the filing on disk, extract its acceptance
 # datetime, reduce the document body to lower-case text without stopwords,
 # count its 2-, 3- and 4-grams and write them (as JSON) back to the row.
 # Each file gets at most 30 seconds; slower files are skipped.
 Parallel.each_with_index(results) do |result_row, idx|
   original_filename = result_row['filename']
   filename = File.basename(original_filename)
   # Strip only the final extension. The previous split('.')/join('') also
   # removed interior dots and produced "" for names without an extension.
   filename_glob = File.basename(filename, '.*')

   expected_file_path = "/path/to/forms/8K/#{filename}"

   # File.size? is nil for missing and for empty files, so it covers the
   # existence and non-empty checks; File.file? rules out directories.
   if File.file?(expected_file_path) && File.size?(expected_file_path)
     path_to_actual_file = expected_file_path
     puts 'file exists'
   else
     # An empty stem would turn the pattern into "*" and pick an unrelated file.
     path_to_actual_file = filename_glob.empty? ? nil : Dir.glob("/path/to/forms/8K/#{filename_glob}*").first
     puts 'could not find file'
   end

   next if path_to_actual_file.nil?

   begin
     Timeout.timeout(30) do
       # File.read closes the handle itself; Kernel#open(path).read left it
       # open and would run a command for a path starting with "|".
       doc = File.read(path_to_actual_file)
       # cik = doc.match(/CENTRAL INDEX KEY.+/)[0].split(/\s+/).last
       begin
         ad_match = doc.match('ACCEPTANCE-DATETIME>(.+)')
       rescue StandardError => e
         # Best effort: a document that cannot be matched (e.g. invalid byte
         # sequences) is skipped, but no longer silently.
         puts "skipped #{filename}: #{e.message}"
         next
       end
       next if ad_match.nil?
       acceptance_datetime = ad_match[1]

       # Remove fragments containing 'begin 644' (presumably uuencoded binary
       # attachments such as images), then drop the first remaining fragment,
       # i.e. whatever precedes the first <TEXT> marker. drop(1) returns []
       # where [1..-1] returned nil (and crashed) if nothing was left.
       no_images = doc.split('<TEXT>')
                      .reject { |fragment| fragment.include?('begin 644') }
                      .drop(1)
                      .join(' ')

       # Normalise: non-breaking spaces (U+00A0, which \s does not match) to
       # plain spaces, collapse whitespace, straighten curly quotes,
       # lower-case, and strip ':', ';' and '.'.
       clean_text = Nokogiri::HTML(no_images).text
                            .tr("\u00A0", ' ')
                            .gsub(/\s+/mx, ' ')
                            .gsub(/“|”|’/, "'")
                            .downcase
                            .gsub(/[:;.]/, '')

       # Tokenise once and reuse the word list for every n.
       words = clean_text.split(' ').reject { |word| stopwords.key?(word) }

       # n => JSON object of n-gram counts, ordered by ascending count.
       n_grams = {}
       [2, 3, 4].each do |n|
         n_grams[n] = score(words.each_cons(n).to_a).sort_by { |_key, value| value }.to_h.to_json
       end

       # One short-lived connection per file; ensure closes it even when the
       # update raises or the surrounding timeout fires.
       conn = PG.connect(dbname: ENV['postgres_db'], user: ENV['postgres_user'], password: ENV['postgres_user_pw'])
       begin
         conn.prepare("f8k_ngram_statement#{idx}", "update form_8k_ngrams set n2_grams=$1,n3_grams=$2,n4_grams=$3,acceptance_datetime=$4 where filename=$5")
         conn.exec_prepared("f8k_ngram_statement#{idx}", [n_grams[2], n_grams[3], n_grams[4], acceptance_datetime, original_filename])
       ensure
         conn.close
       end

       puts "success! #{idx}"
     end
   rescue Timeout::Error
     puts "skipped #{filename} because too slow"
   end
 end
	require 'nokogiri'
	require 'pg'
	require 'json'
	require 'parallel'
	require 'timeout'

	# Counts how many times each element occurs in +array+.
	#
	# Returns a Hash mapping every distinct element to its number of
	# occurrences. The Hash has a default value of 0, so looking up an element
	# that never appeared yields 0 rather than nil.
	#
	# The block-parameter pipes were escaped as "\|" in this copy, which is
	# not valid Ruby; they are restored here.
	def score( array )
	  array.each_with_object(Hash.new(0)) { |key, hash| hash[key] += 1 }
	end


	# Select the filings to process: rows of form_8k_ngrams whose n2_grams is
	# still NULL, filtered by random() < 0.1 so each run only picks up a random
	# subset (roughly a tenth) of the outstanding rows.
	#
	# PG.connect replaces the deprecated PGconn constant (removed in pg 1.0), and
	# the ensure guarantees the connection is closed even if the query raises.
	pg_client = PG.connect(dbname: ENV['postgres_db'], user: ENV['postgres_user'], password: ENV['postgres_user_pw'])
	begin
	  results = pg_client.exec("select filename from form_8k_ngrams where n2_grams is null and random()<0.1").to_a
	ensure
	  pg_client.close
	end

	# English stopwords removed from the text before n-grams are built. The list
	# is built once, outside the per-file block, and stored as a Hash so every
	# membership test is a constant-time lookup instead of an Array scan.
	stopwords = %w[
	  i me my myself we our ours ourselves you your yours yourself yourselves
	  he him his himself she her hers herself it its itself they them their
	  theirs themselves what which who whom this that these those am is are was
	  were be been being have has had having do does did doing a an the and but
	  if or because as until while of at by for with about against between into
	  through during before after above below to from up down in out on off over
	  under again further then once here there when where why how all any both
	  each few more most other some such no nor not only own same so than too
	  very s t can will just don should now d ll m o re ve y ain aren couldn
	  didn doesn hadn hasn haven isn ma mightn mustn needn shan shouldn wasn
	  weren won wouldn
	].each_with_object({}) { |word, lookup| lookup[word] = true }

	# For every selected row: locate the filing on disk, extract its acceptance
	# datetime, reduce the document body to lower-case text without stopwords,
	# count its 2-, 3- and 4-grams and write them (as JSON) back to the row.
	# Each file gets at most 30 seconds; slower files are skipped.
	# (The block-parameter pipes were escaped as "\|" in this copy, which is not
	# valid Ruby; they are restored here along with the nesting indentation.)
	Parallel.each_with_index(results) do |result_row, idx|
	  original_filename = result_row['filename']
	  filename = File.basename(original_filename)
	  # Strip only the final extension. The previous split('.')/join('') also
	  # removed interior dots and produced "" for names without an extension.
	  filename_glob = File.basename(filename, '.*')

	  expected_file_path = "/path/to/forms/8K/#{filename}"

	  # File.size? is nil for missing and for empty files, so it covers the
	  # existence and non-empty checks; File.file? rules out directories.
	  if File.file?(expected_file_path) && File.size?(expected_file_path)
	    path_to_actual_file = expected_file_path
	    puts 'file exists'
	  else
	    # An empty stem would turn the pattern into "*" and pick an unrelated file.
	    path_to_actual_file = filename_glob.empty? ? nil : Dir.glob("/path/to/forms/8K/#{filename_glob}*").first
	    puts 'could not find file'
	  end

	  next if path_to_actual_file.nil?

	  begin
	    Timeout.timeout(30) do
	      # File.read closes the handle itself; Kernel#open(path).read left it
	      # open and would run a command for a path starting with "|".
	      doc = File.read(path_to_actual_file)
	      # cik = doc.match(/CENTRAL INDEX KEY.+/)[0].split(/\s+/).last
	      begin
	        ad_match = doc.match('ACCEPTANCE-DATETIME>(.+)')
	      rescue StandardError => e
	        # Best effort: a document that cannot be matched (e.g. invalid byte
	        # sequences) is skipped, but no longer silently.
	        puts "skipped #{filename}: #{e.message}"
	        next
	      end
	      next if ad_match.nil?
	      acceptance_datetime = ad_match[1]

	      # Remove fragments containing 'begin 644' (presumably uuencoded binary
	      # attachments such as images), then drop the first remaining fragment,
	      # i.e. whatever precedes the first <TEXT> marker. drop(1) returns []
	      # where [1..-1] returned nil (and crashed) if nothing was left.
	      no_images = doc.split('<TEXT>')
	                     .reject { |fragment| fragment.include?('begin 644') }
	                     .drop(1)
	                     .join(' ')

	      # Normalise: non-breaking spaces (U+00A0, which \s does not match) to
	      # plain spaces, collapse whitespace, straighten curly quotes,
	      # lower-case, and strip ':', ';' and '.'.
	      clean_text = Nokogiri::HTML(no_images).text
	                           .tr("\u00A0", ' ')
	                           .gsub(/\s+/mx, ' ')
	                           .gsub(/“|”|’/, "'")
	                           .downcase
	                           .gsub(/[:;.]/, '')

	      # Tokenise once and reuse the word list for every n.
	      words = clean_text.split(' ').reject { |word| stopwords.key?(word) }

	      # n => JSON object of n-gram counts, ordered by ascending count.
	      n_grams = {}
	      [2, 3, 4].each do |n|
	        n_grams[n] = score(words.each_cons(n).to_a).sort_by { |_key, value| value }.to_h.to_json
	      end

	      # One short-lived connection per file; ensure closes it even when the
	      # update raises or the surrounding timeout fires.
	      conn = PG.connect(dbname: ENV['postgres_db'], user: ENV['postgres_user'], password: ENV['postgres_user_pw'])
	      begin
	        conn.prepare("f8k_ngram_statement#{idx}", "update form_8k_ngrams set n2_grams=$1,n3_grams=$2,n4_grams=$3,acceptance_datetime=$4 where filename=$5")
	        conn.exec_prepared("f8k_ngram_statement#{idx}", [n_grams[2], n_grams[3], n_grams[4], acceptance_datetime, original_filename])
	      ensure
	        conn.close
	      end

	      puts "success! #{idx}"
	    end
	  rescue Timeout::Error
	    puts "skipped #{filename} because too slow"
	  end
	end
No results found