Created
March 23, 2016 20:34
-
-
Save leandromuto/01947ba5c48f6cd5dce7 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This is a template for a Python scraper on morph.io (https://morph.io),
# including some code snippets below that you should find helpful.
import urllib | |
import scraperwiki | |
import pdb | |
from bs4 import BeautifulSoup as bs | |
# Scrape the anchors found under div.nhgpa_list on the Pao de Acucar home page
# and store each one (text + href) as a row of the 'secoes' table.
encoding = 'utf-8'
url = 'http://www.paodeacucar.com.br/'

# Read in a page, closing the connection even if the read fails.
response = urllib.urlopen(url)
try:
    html = response.read()
finally:
    response.close()

# Making a soup; from_encoding tells BeautifulSoup how to decode the raw bytes.
soup = bs(html, from_encoding=encoding)

menu_header = soup.select('div.nhgpa_list a')

for item in menu_header:
    link = item.get('href')
    if link is None:
        # An anchor without an href has nothing to store; item['href'] would
        # raise KeyError here.
        continue
    secao = item.text

    # Encode only for console output, so printing never depends on the
    # terminal's default codec.
    print(secao.encode(encoding, 'replace'))

    # Save the *unicode* values. sqlite3 rejects 8-bit byte strings
    # ("You must not use 8-bit bytestrings ..."), so text must not be
    # encoded before it reaches the database layer.
    scraperwiki.sqlite.save(
        unique_keys=['secao', 'href'],
        data={"secao": secao, "href": link},
        table_name='secoes',
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Traceback (most recent call last):
File "/Users/leandromuto/GitHub/pao_de_acucar_produtos/scraper.py", line 28, in <module>
table_name='secoes')
File "/Library/Python/2.7/site-packages/scraperwiki/sql.py", line 203, in save
connection.execute(insert.values(row))
File "/Library/Python/2.7/site-packages/sqlalchemy/engine/base.py", line 914, in execute
return meth(self, multiparams, params)
File "/Library/Python/2.7/site-packages/sqlalchemy/sql/elements.py", line 323, in _execute_on_connection
return connection._execute_clauseelement(self, multiparams, params)
File "/Library/Python/2.7/site-packages/sqlalchemy/engine/base.py", line 1010, in _execute_clauseelement
compiled_sql, distilled_params
File "/Library/Python/2.7/site-packages/sqlalchemy/engine/base.py", line 1146, in _execute_context
context)
File "/Library/Python/2.7/site-packages/sqlalchemy/engine/base.py", line 1341, in _handle_dbapi_exception
exc_info
File "/Library/Python/2.7/site-packages/sqlalchemy/util/compat.py", line 200, in raise_from_cause
reraise(type(exception), exception, tb=exc_tb, cause=cause)
File "/Library/Python/2.7/site-packages/sqlalchemy/engine/base.py", line 1139, in _execute_context
context)
File "/Library/Python/2.7/site-packages/sqlalchemy/engine/default.py", line 450, in do_execute
cursor.execute(statement, parameters)
sqlalchemy.exc.ProgrammingError: (sqlite3.ProgrammingError) You must not use 8-bit bytestrings unless you use a text_factory that can interpret 8-bit bytestrings (like text_factory = str). It is highly recommended that you instead just switch your application to Unicode strings. [SQL: u'INSERT OR REPLACE INTO secoes (href, secao) VALUES (?, ?)'] [parameters: (u'http://www.paodeacucar.com.br/secoes/C5335/pascoa', 'P\xc3\xa1scoa')]