Last active
February 5, 2019 15:45
-
-
Save mvrozanti/c3bb414d7233bc79dd60840a45db2d8d to your computer and use it in GitHub Desktop.
Scraper do site portaldalinguaportuguesa.org
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
from bs4 import BeautifulSoup | |
import requests | |
import string | |
import json | |
import time  # stdlib; only used to pause between failed download attempts

# Scraper for the syllable-division listings of portaldalinguaportuguesa.org.
#
# For every letter a-z the listing pages are downloaded one after another and
# every (word, syllable division) pair found is stored in a JSON file shaped
# like:
#     {"a": {"word": "syl-la-bles", ...}, ..., "next_page_for_char": {...}}
# "next_page_for_char" maps a letter to the URL of the next page still to be
# fetched for it (None once no further page link was found), so an interrupted
# run can continue where it stopped.

base_url = 'http://www.portaldalinguaportuguesa.org'
syllables = base_url + '/index.php?action=syllables&act=list&letter='
session = requests.session()
save_path = 'palavras-divisao-silabica.json'


def _load_progress(path):
    """Return the results saved by an earlier run, or {} if none are usable."""
    try:
        with open(path, encoding='utf8') as saved:
            loaded = json.load(saved)
    except (OSError, ValueError):
        # Missing/unreadable file or invalid JSON: start from scratch.
        return {}
    return loaded if isinstance(loaded, dict) else {}


def _save_progress(path, data):
    """Write the collected results to `path` as UTF-8 JSON."""
    with open(path, 'w', encoding='utf8') as out:
        json.dump(data, out, ensure_ascii=False)


def _fetch_html(url):
    """Download `url`, retrying until a non-empty body is received."""
    while True:
        try:
            html = session.get(url, timeout=10).text
        except requests.RequestException as error:
            # Only network-level failures are retried; Ctrl-C still aborts.
            print('download failed, retrying: %s (%s)' % (url, error))
            html = None
        if html:
            return html
        time.sleep(1)  # do not hammer the server while it is unreachable


def _extract_pairs(soup):
    """Yield (word, division) tuples from the titled <td> cells of a page.

    Titled cells are consumed alternately: the first one supplies the word
    (the text before its first '('), the following one its syllable division.
    NOTE(review): this presumes the site emits exactly these two titled cells
    per entry -- confirm against the live markup.
    """
    word = None
    for td in soup.find_all('td'):
        if not td.has_attr('title'):
            continue
        if word:
            yield word, td.text.replace('·', '-').rstrip('\r\n\t')
            word = None
        else:
            # partition() tolerates cells without '(' instead of raising.
            word = td.text.partition('(')[0].rstrip()


def _find_next_page(soup):
    """Return the absolute URL behind the 'seguintes' link, or None."""
    next_page = None
    for anchor in soup.find_all('a'):
        if anchor.text == 'seguintes' and anchor.has_attr('href'):
            next_page = base_url + anchor['href']
    return next_page


words_syl_div = _load_progress(save_path)

# Keep one bookmark per letter instead of replacing the whole mapping.
bookmarks = words_syl_div.get('next_page_for_char')
if not isinstance(bookmarks, dict):
    bookmarks = {}
words_syl_div['next_page_for_char'] = bookmarks

for char in string.ascii_lowercase:
    # A stored None means the letter was already scraped to its last page.
    next_page_for_char = bookmarks.get(char, syllables + char)
    # Extend what was saved before rather than overwriting it on resume.
    words_for_char = words_syl_div.setdefault(char, {})
    processed = 0
    while next_page_for_char:
        soup = BeautifulSoup(_fetch_html(next_page_for_char), 'lxml')
        for word, div in _extract_pairs(soup):
            if not processed % 100:
                print({word: div})  # occasional progress output
            words_for_char[word] = div
            processed += 1
        next_page_for_char = _find_next_page(soup)
        bookmarks[char] = next_page_for_char
        # Persist after every page so an interrupted run loses at most one.
        _save_progress(save_path, words_syl_div)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Exemplo de output:
$ jq '.l' < palavras-divisao-silabica.json | head
:$ jq '.l.lastro' < palavras-divisao-silabica.json
:"las-tro"
Para caracteres acentuados:
$ jq '.a."aéride"' < palavras-divisao-silabica.json
:"a-é-ri-de"
Para facilitação da chamada:
divsil(){ [[ -z $@ ]] && return; jq '.'`echo $@|cut -c1`'."'$@'"' < ~/aonde/esta/o/seu/palavras-divisao-silabica.json }
:$ divsil artesão
"ar-te-são"