Created
February 10, 2021 23:07
-
-
Save starenka/7395d4b5f7e30f680ed35d722d4d44d5 to your computer and use it in GitHub Desktop.
Word frequencies for Harry Potter book 1 in English (en), Spanish (es), and Czech (cs)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[EN] | |
- unique: 4581 | |
- most common: | |
Harry: 1400 | |
say: 879 | |
Ron: 434 | |
look: 419 | |
Hagrid: 371 | |
know: 310 | |
go: 279 | |
Hermione: 274 | |
get: 267 | |
think: 239 | |
like: 216 | |
come: 212 | |
tell: 186 | |
Professor: 178 | |
try: 174 | |
Snape: 172 | |
Dudley: 165 | |
Dumbledore: 156 | |
time: 144 | |
Vernon: 138 | |
want: 136 | |
find: 135 | |
right: 133 | |
Uncle: 131 | |
eye: 127 | |
Malfoy: 127 | |
door: 124 | |
good: 123 | |
face: 122 | |
see: 121 | |
head: 121 | |
turn: 119 | |
hand: 117 | |
Neville: 117 | |
leave: 112 | |
Quirrell: 112 | |
hear: 111 | |
Potter: 111 | |
take: 109 | |
boy: 104 | |
thing: 104 | |
way: 104 | |
ask: 102 | |
stand: 99 | |
McGonagall: 98 | |
room: 97 | |
year: 95 | |
point: 94 | |
yeh: 94 | |
people: 87 | |
Mr.: 84 | |
sit: 84 | |
feel: 84 | |
Hogwarts: 83 | |
start: 82 | |
Gryffindor: 82 | |
let: 79 | |
long: 79 | |
stop: 78 | |
ter: 78 | |
open: 77 | |
Petunia: 76 | |
day: 75 | |
wand: 75 | |
pull: 74 | |
little: 73 | |
away: 72 | |
give: 72 | |
walk: 72 | |
oh: 72 | |
school: 72 | |
suddenly: 70 | |
wizard: 70 | |
letter: 69 | |
old: 69 | |
yes: 69 | |
foot: 69 | |
owl: 67 | |
stare: 67 | |
bit: 67 | |
voice: 67 | |
book: 67 | |
fall: 66 | |
catch: 65 | |
Quidditch: 64 | |
Aunt: 64 | |
Stone: 62 | |
watch: 61 | |
inside: 61 | |
hold: 60 | |
great: 60 | |
second: 60 | |
yer: 58 | |
Dursley: 57 | |
black: 57 | |
remember: 57 | |
lot: 56 | |
floor: 56 | |
end: 56 | |
large: 55 | |
[ES] | |
- unique: 5437 | |
- most common: | |
y: 1921 | |
a: 1676 | |
Harry: 1299 | |
Ron: 417 | |
Hagrid: 364 | |
Hermione: 272 | |
mirar: 241 | |
poder: 233 | |
profesor: 215 | |
Y: 200 | |
volver: 183 | |
Snape: 166 | |
parecer: 165 | |
preguntar: 153 | |
Dumbledore: 149 | |
ver: 146 | |
haber: 145 | |
tío: 143 | |
señor: 143 | |
ir: 142 | |
tener: 138 | |
Dudley: 135 | |
pensar: 132 | |
puerta: 130 | |
pasar: 128 | |
decir: 125 | |
querer: 124 | |
estar: 123 | |
Malfoy: 123 | |
cabeza: 122 | |
saber: 115 | |
casar: 114 | |
Vernon: 114 | |
oír: 113 | |
Neville: 112 | |
Quirrell: 110 | |
salir: 108 | |
Potter: 104 | |
tratar: 104 | |
encontrar: 103 | |
hacer: 102 | |
Dursley: 102 | |
hablar: 101 | |
ojo: 99 | |
año: 98 | |
poner: 96 | |
esperar: 95 | |
McGonagall: 95 | |
deber: 92 | |
gritar: 91 | |
Gryffindor: 88 | |
vestir: 87 | |
voz: 85 | |
escoba: 83 | |
sentir: 82 | |
noche: 82 | |
seguir: 81 | |
o: 80 | |
llevar: 77 | |
Hogwarts: 76 | |
comenzar: 76 | |
dejar: 75 | |
caer: 74 | |
entrar: 74 | |
varita: 73 | |
llegar: 72 | |
abrir: 72 | |
aire: 71 | |
clase: 68 | |
padre: 68 | |
mago: 66 | |
llamar: 66 | |
gente: 66 | |
punto: 66 | |
buscar: 66 | |
carta: 65 | |
colegiar: 64 | |
Oh: 64 | |
lechuzo: 61 | |
capar: 61 | |
Piedra: 61 | |
pequeño: 59 | |
correr: 59 | |
manir: 59 | |
Weasley: 59 | |
soler: 58 | |
jugar: 57 | |
coger: 57 | |
suceder: 57 | |
dar: 57 | |
Petunia: 57 | |
levantar: 57 | |
muchacho: 57 | |
Slytherin: 57 | |
alto: 56 | |
caro: 56 | |
Wood: 56 | |
chico: 55 | |
quidditch: 54 | |
aparecer: 54 | |
[CS] | |
- unique: 7707 | |
- most common: | |
Harry: 1269 | |
on: 659 | |
říci: 512 | |
Ron: 405 | |
mít: 349 | |
Hagrid: 300 | |
vědět: 279 | |
být: 270 | |
ten: 240 | |
Hermion: 230 | |
moci: 218 | |
se: 196 | |
stát: 192 | |
vidět: 171 | |
muset: 169 | |
všechen: 149 | |
celý: 148 | |
říkat: 147 | |
Snape: 146 | |
hlava: 144 | |
chvíle: 138 | |
jít: 138 | |
dostat: 132 | |
Dudley: 131 | |
dveře: 130 | |
Brumbál: 130 | |
oko: 127 | |
zeptat: 125 | |
ruka: 124 | |
pan: 118 | |
myslit: 113 | |
Malfoy: 113 | |
velký: 111 | |
svůj: 108 | |
podívat: 104 | |
strýc: 104 | |
nějaký: 102 | |
chtít: 101 | |
velice: 99 | |
dokázat: 99 | |
poněvadž: 98 | |
vypadat: 98 | |
McGonagallová: 96 | |
slyšet: 95 | |
Vernon: 95 | |
opravdu: 94 | |
dělat: 93 | |
nikdy: 92 | |
začít: 92 | |
kolem: 90 | |
profesor: 90 | |
profesorka: 89 | |
malý: 86 | |
sám: 86 | |
dobrý: 86 | |
znovu: 86 | |
všecek: 84 | |
jeden: 84 | |
zpátky: 83 | |
dát: 82 | |
chlapec: 80 | |
člověk: 80 | |
najít: 79 | |
žádný: 78 | |
Bradavice: 77 | |
takový: 75 | |
hodina: 75 | |
myslet: 75 | |
udělat: 74 | |
Quirrell: 74 | |
dlouhý: 73 | |
druhý: 73 | |
oba: 73 | |
přijít: 72 | |
ostatní: 70 | |
škola: 70 | |
slovo: 69 | |
Nevill: 69 | |
rok: 68 | |
noha: 68 | |
kámen: 67 | |
paní: 67 | |
úplně: 67 | |
nikdo: 67 | |
dopis: 67 | |
místnost: 67 | |
Nebelvír: 67 | |
čekat: 66 | |
spíše: 66 | |
který: 65 | |
teta: 65 | |
vrátit: 65 | |
hůlka: 65 | |
koště: 65 | |
sedět: 64 | |
místo: 63 | |
tvář: 62 | |
Dursley: 61 | |
jediný: 61 | |
černý: 61 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding=utf-8 | |
# python -m spacy download es_core_news_md && python -m spacy download en_core_web_md
# python -c "import spacy_udpipe; spacy_udpipe.download('cs')" | |
from collections import Counter | |
import spacy | |
import spacy_udpipe | |
def get_words(doc, freq_thresh=100):
    """Count the lemmas of the content tokens in *doc*.

    Tokens flagged as stop words or punctuation, and tokens whose text is
    whitespace only, are ignored.

    Args:
        doc: Iterable of token objects exposing ``lemma_``, ``text``,
            ``is_stop`` and ``is_punct``.
        freq_thresh: How many of the most frequent lemmas to return. Despite
            the name this is a result-size limit, not a minimum frequency.

    Returns:
        A ``(unique, freqs)`` tuple: ``unique`` is the number of distinct
        lemmas kept, ``freqs`` is a list of ``(lemma, count)`` pairs for the
        ``freq_thresh`` most common lemmas, most frequent first.
    """
    # Feed the Counter straight from a generator: its length is the number of
    # distinct lemmas, so no intermediate list/set is needed.
    counts = Counter(
        token.lemma_
        for token in doc
        if not token.is_stop
        and not token.is_punct
        and not token.text.isspace()
    )
    return len(counts), counts.most_common(freq_thresh)
def dump(unique, freqs, lang):
    """Print a frequency report for one language to stdout.

    Args:
        unique: Number of distinct words, printed on the "unique" line.
        freqs: Iterable of ``(word, count)`` pairs, printed one per line in
            the order given.
        lang: Language code; shown upper-cased in the bracketed header.
    """
    report = [
        '\n\n[%s]' % lang.upper(),
        '- unique: %d' % unique,
        '- most common:',
    ]
    report.extend('%s: %d' % (word, count) for word, count in freqs)
    # One print of the joined lines yields the same output as line-by-line
    # printing: every line ends with a single newline.
    print('\n'.join(report))
def _read_text(path):
    """Return the entire contents of the text file at *path*."""
    # The context manager closes the handle deterministically. The encoding
    # is pinned so decoding does not depend on the platform default.
    # NOTE(review): assumes the corpus files are UTF-8 -- confirm.
    with open(path, encoding='utf-8') as handle:
        return handle.read()


# Load one pipeline per language: spaCy models for English and Spanish,
# a spacy_udpipe model for Czech.
en, es = spacy.load('en_core_web_md'), spacy.load('es_core_news_md')
cs = spacy_udpipe.load('cs')

# Run each full text through its language pipeline.
doc_en = en(_read_text('data/harry_potter_and_the_sorcerers_-_j.k._rowling.txt'))
doc_es = es(_read_text('data/Harry_Potter_y_la_Piedra_Filosofal_01.txt'))
doc_cs = cs(_read_text('data/Rowlingová_J_K-1-Harry Potter a Kámen mudrců.txt'))

# Each result is a (unique_count, most_common) tuple, unpacked into dump().
wen, wes, wcs = get_words(doc_en), get_words(doc_es), get_words(doc_cs)

dump(*wen, 'en')
dump(*wes, 'es')
dump(*wcs, 'cs')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment