Created
July 3, 2015 17:56
-
-
Save sixtyfive/c03bc1bbb91a8e61b849 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
# frozen_string_literal: true

# Counts how often each word occurs in an Arabic-script corpus file and writes
# the result, most frequent word first, to a word frequency file.
#
# Usage: ./count_words.rb <corpus file> <word frequency file>
#
# Each output line has the form "<TAB><frequency><TAB><word>".

# Ordered [pattern, replacement] pairs applied to the corpus before it is split
# into words. Order matters: later rules rely on the output of earlier ones.
REPLACEMENTS = [
  # Remove links to audio fragment files. `[^}]*` keeps the match inside one
  # pair of braces, so two links on the same line are removed separately.
  [/\\href\{[^}]*\.flac\}/, ''],
  # Control characters include newlines and tabs; turn them into a separator
  # so that the words on either side of a line break are not fused together.
  [/[[:cntrl:]]+/, ' '],
  [/[[:punct:][:digit:]]/, ''], # remove punctuation and digits
  [/[؟،ـ–…]/, ''], # remove Arabic punctuation characters and the tatweel
  [/[٠١٢٣٤٥٦٧٨٩]/, ''], # remove Arabic-Indic digits
  # remove šadda, sukūn, fatḥa, fatḥatān, ḍamma, ḍammatān, kasra, kasratān
  [/[\u0651\u0652\u064e\u064b\u064f\u064c\u0650\u064d]/, ''],
  # Replace ālif with hamza above by plain ālif.
  # NOTE(review): both literals look like presentation forms rather than the
  # standard letters U+0623 / U+0627 -- confirm this matches the corpus.
  ['ﺃ', 'ﺍ'],
  ['ﻻ', 'لا'], # replace lām-ālif ligature with discrete lām and ālif
  # Split the definite article "al" from whatever follows it, at the start of
  # the text or after a space, without ripping apart allī (i.e. allaḏī) or
  # aḷḷā. (An earlier, additional rule used the placeholders `<word>` and
  # `[:word]`, which a regexp treats as literal text; it was dropped.)
  [/(?:\A| )ال(?!ل[هي])/, ' ال '],
  # Safe to tokenize, but needs to happen manually.
  [' ولا', ' و لا'],
  [' وتعالى', ' و تعالى'], # keep the leading space so the previous word stays separate
  [' للي', ' اللي'], # same word, different pronunciation
  [' لله', ' الله'], # same word, different orthography
  ['الامثال', 'ال امثال'],
  ['بالاصدقاء', 'ب ال اصدقاء'],
  ['سليماناسكت', 'سليمان اسكت'],
  ['خميسسلام', 'خميس سلام']
].freeze

# Applies every rule in REPLACEMENTS, in order, to +text+.
#
# @param text [String] raw corpus text (UTF-8); it is not modified
# @return [String] a new, normalized string
def normalize_corpus(text)
  REPLACEMENTS.reduce(text) do |result, (pattern, replacement)|
    result.gsub(pattern, replacement)
  end
end

# Splits +text+ on whitespace and counts the occurrences of each word.
#
# @param text [String] normalized corpus text
# @return [Hash{String => Integer}] word => number of occurrences
def count_words(text)
  # A bare `split` ignores leading whitespace, so no empty "word" is counted.
  text.split.each_with_object(Hash.new(0)) { |word, counts| counts[word] += 1 }
end

# Orders word counts by descending frequency; ties are ordered by word.
#
# @param counts [Hash{String => Integer}] word => number of occurrences
# @return [Array<Array(String, Integer)>] sorted [word, frequency] pairs
def sort_frequencies(counts)
  counts.sort_by { |word, frequency| [-frequency, word] }
end

# Writes one "<TAB>frequency<TAB>word" line per pair to +path+ (UTF-8).
#
# @param path [String] file to create or overwrite
# @param frequencies [Array<Array(String, Integer)>] [word, frequency] pairs
# @return [void]
def write_frequencies(path, frequencies)
  File.open(path, 'w:UTF-8') do |word_frequency_file|
    frequencies.each do |word, frequency|
      word_frequency_file.puts "\t#{frequency}\t#{word}"
    end
  end
end

# Reads the corpus at +corpus_path+, counts its words and writes the sorted
# frequencies to +output_path+.
#
# @param corpus_path [String] corpus file to read (UTF-8)
# @param output_path [String] word frequency file to write
# @return [void]
def count_words_in_file(corpus_path, output_path)
  corpus = File.read(corpus_path, encoding: 'UTF-8')
  frequencies = sort_frequencies(count_words(normalize_corpus(corpus)))
  write_frequencies(output_path, frequencies)
end

if $PROGRAM_NAME == __FILE__
  if ARGV.length > 1
    count_words_in_file(ARGV[0], ARGV[1])
  else
    puts "Usage: ./count_words.rb <corpus file> <word frequency file>"
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment