Created
December 25, 2022 20:49
-
-
Save adamori/6a7ff9ec9dca08a7989dfc345531777d to your computer and use it in GitHub Desktop.
Usage: `cscript script.vbs jerome.txt 10`, where script.vbs is the name of this script, jerome.txt is the text file to analyse, and 10 is the number of top words to print.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
' This Visual Basic script reads the text file given as the first command-line argument and processes the text
' to find and count the most popular words in the file. The script first creates a regular expression object
' that is used to identify the individual words in the text. It then iterates over the text, adding each word to a dictionary object
' that stores the count of each word. Once all words are processed, the script sorts the dictionary by word count in descending order and
' prints the N most popular words, where N is given as the second command-line argument.
' The script then performs a similar process to find and count the words that contain apostrophes.
' It uses a different regular expression to identify those words, and counts and sorts them the same way as in the previous step.
' Finally, the script prints the total number of words in the file and the number of unique words.
' NOTE(review): the code below does not appear to print these totals - confirm whether that step is missing.
' Parse the command line and load the input text.
'   objArgs(0) - path of the text file to analyse
'   objArgs(1) - how many of the most frequent words to print
' On success strFileText holds the whole file content (possibly "").
Set objArgs = WScript.Arguments

' Both arguments are required. Without quitting here the script would go on
' and fail with a subscript error on objArgs(0) / objArgs(1).
If objArgs.Count <= 1 Then
    WScript.Echo "Enter filename and count of words"
    WScript.Quit 1
End If

' Fail with a readable message instead of a runtime error on a bad path.
Set objFso = CreateObject("Scripting.FileSystemObject")
If Not objFso.FileExists(objArgs(0)) Then
    WScript.Echo "File not found:", objArgs(0)
    WScript.Quit 1
End If

' Open the specified text file (1 = ForReading)
Set objFileToRead = objFso.OpenTextFile(objArgs(0), 1)

' Read the entire contents of the file into a string.
' ReadAll raises "Input past end of file" on an empty file, so guard it.
If objFileToRead.AtEndOfStream Then
    strFileText = ""
Else
    strFileText = objFileToRead.ReadAll()
End If

objFileToRead.Close
Set objFileToRead = Nothing
Set objFso = Nothing
' Tokenise the text: every maximal run of non-whitespace characters is one word.
Set wordRe = New RegExp
wordRe.Pattern = "\S+"
wordRe.Global = True
wordRe.IgnoreCase = True
Set matches = wordRe.Execute(strFileText)

' Dictionary mapping each (lower-cased, normalised) word to its occurrence count
Dim found
Set found = CreateObject("Scripting.Dictionary")

For Each hit In matches
    token = LCase(hit.Value)

    ' Normalise a fixed set of tokens before counting: two contractions are
    ' replaced by their expanded form, and a few words are mapped to a base word.
    ' Any other token is counted unchanged.
    Select Case token
        Case "don't"
            token = "do not"
        Case "can't"
            token = "can not"
        Case "this"
            token = "the"
        Case "his", "him"
            token = "he"
        Case "her"
            token = "she"
    End Select

    ' Make sure the key exists, then bump its counter
    If Not found.Exists(token) Then
        found.Add token, 0
    End If
    found(token) = found(token) + 1
Next
' Sort the word counts in descending order and print the top N words,
' where N is the second command-line argument.

' Copy the dictionary into two parallel zero-based arrays (same index = same word)
words_keys = found.Keys
words_count = found.Items
' UBound is the LAST valid index (number of words - 1), or -1 if there are none
words_len = UBound(words_count)

' Selection sort, descending by count.
' The inner loop must run up to words_len inclusive; stopping at words_len - 1
' would leave the last element out of the sort entirely.
For i = 0 To words_len - 1
    max_idx = i
    For idx = i + 1 To words_len
        If words_count(idx) > words_count(max_idx) Then
            max_idx = idx
        End If
    Next
    If max_idx <> i Then
        temp = words_count(i)
        temp_key = words_keys(i)
        words_count(i) = words_count(max_idx)
        words_keys(i) = words_keys(max_idx)
        words_count(max_idx) = temp
        words_keys(max_idx) = temp_key
    End If
Next
' End sorting

' Work out how many rows to print: exactly N (not N + 1), and never more than
' the number of distinct words, so the loop cannot index past the arrays.
top_n = 0
If objArgs.Count > 1 Then
    If IsNumeric(objArgs(1)) Then
        top_n = CLng(objArgs(1))
    End If
End If
last_shown = top_n - 1
If last_shown > words_len Then
    last_shown = words_len
End If

' Output the most popular words in the file
WScript.Echo "The most popular words in", objArgs(0), "are:"
WScript.Echo ""
For i = 0 To last_shown
    ' Columns: word, count, and a third number that is the count itself for the
    ' first row and half of the previous row's count for every later row.
    ' NOTE(review): the purpose of the third column is not documented; kept as-is.
    ' CLng instead of CInt so counts above 32767 do not raise an overflow error.
    If i > 0 Then
        WScript.Echo words_keys(i), words_count(i), CLng(words_count(i - 1) / 2)
    Else
        WScript.Echo words_keys(i), words_count(i), CLng(words_count(i))
    End If
Next

' Output some blank lines
WScript.Echo ""
WScript.Echo ""
'-------------------------SECOND PART-------------------------
' Count every token of the form <word chars>'<word chars> in the original text.
' NOTE(review): this scans the raw file text, so "don't" / "can't" are counted
' here as well - confirm whether that is intended.
Set apostropheRe = New RegExp
apostropheRe.Pattern = "\w+[']\w+"
apostropheRe.Global = True
apostropheRe.IgnoreCase = True
Set matches = apostropheRe.Execute(strFileText)

' Start from a fresh dictionary: lower-cased short form -> occurrence count
Set found = CreateObject("Scripting.Dictionary")
For Each hit In matches
    shortForm = LCase(hit.Value)
    ' Make sure the key exists, then bump its counter
    If Not found.Exists(shortForm) Then
        found.Add shortForm, 0
    End If
    found(shortForm) = found(shortForm) + 1
Next
' Sort the short-form counts in descending order and print the top N,
' where N is the second command-line argument.

' Copy the dictionary into two parallel zero-based arrays (same index = same word)
words_keys = found.Keys
words_count = found.Items
' UBound is the LAST valid index (number of entries - 1), or -1 if there are none
words_len = UBound(words_count)

' Selection sort, descending by count.
' The inner loop must run up to words_len inclusive; stopping at words_len - 1
' would leave the last element out of the sort entirely.
For i = 0 To words_len - 1
    max_idx = i
    For idx = i + 1 To words_len
        If words_count(idx) > words_count(max_idx) Then
            max_idx = idx
        End If
    Next
    If max_idx <> i Then
        temp = words_count(i)
        temp_key = words_keys(i)
        words_count(i) = words_count(max_idx)
        words_keys(i) = words_keys(max_idx)
        words_count(max_idx) = temp
        words_keys(max_idx) = temp_key
    End If
Next
' End sorting

' Work out how many rows to print: exactly N (not N + 1), and never more than
' the number of distinct short forms, so the loop cannot index past the arrays
' (a text may well contain fewer than N short forms, or none at all).
top_n = 0
If objArgs.Count > 1 Then
    If IsNumeric(objArgs(1)) Then
        top_n = CLng(objArgs(1))
    End If
End If
last_shown = top_n - 1
If last_shown > words_len Then
    last_shown = words_len
End If

WScript.Echo "The most popular still remaining short forms in ", objArgs(0), "are:"
WScript.Echo ""
For i = 0 To last_shown
    ' Columns: short form, count, and a third number that is the count itself for
    ' the first row and half of the previous row's count for every later row.
    ' NOTE(review): the purpose of the third column is not documented; kept as-is.
    ' CLng instead of CInt so counts above 32767 do not raise an overflow error.
    If i > 0 Then
        WScript.Echo words_keys(i), words_count(i), CLng(words_count(i - 1) / 2)
    Else
        WScript.Echo words_keys(i), words_count(i), CLng(words_count(i))
    End If
Next
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment