Created
October 24, 2018 03:57
-
-
Save GabrielMMelo/a428b2ccbfcfb32fe8970bd646d15735 to your computer and use it in GitHub Desktop.
Using Beautiful Soup to extract data (web mining)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import peewee | |
from peewee import * | |
from bs4 import BeautifulSoup | |
import sys | |
import os | |
# Database handle that the model's Meta class binds to.
# NOTE(review): the account name and password are hard-coded in the source;
# consider moving them out of version control.
db = MySQLDatabase(
    "redacao_inteligente",
    user="root",
    passwd="emakersjr",
)
class Blog(Model):
    """Record with title, text, author and author_url columns, stored via ``db``."""

    # Field order is kept as-is: it determines the column order of the table.
    title = CharField()
    text = TextField()
    author = CharField()
    author_url = CharField()

    class Meta:
        database = db
def _extract_post(soup, fallback_title):
    """Return ``(title, author, author_url, text)`` for a parsed page.

    The data is read from the first ``div.container_conteudo_publicacao``:
    its first ``<a>`` gives the author name and link, its ``<h1>`` the title
    and its ``<article>`` the text.  Missing pieces fall back to an empty
    author, *fallback_title*, and the text of the whole container.

    Returns ``None`` when the page has no such container at all, so the
    caller can skip the page instead of crashing on ``None.a``.
    """
    # Look the container up once instead of re-searching the tree per field.
    container = soup.find('div', class_='container_conteudo_publicacao')
    if container is None:
        return None

    link = container.a
    if link is None:
        author, author_url = "", ""
    else:
        author = link.getText()
        # An anchor without href must not abort the whole import.
        author_url = link.get('href', "")

    heading = container.h1
    title = fallback_title if heading is None else heading.getText()

    article = container.article
    text = (container if article is None else article).getText()
    return title, author, author_url, text


def _list_pages(target):
    """Return ``(label, path)`` pairs for the page(s) named by *target*.

    A directory yields one pair per regular file directly inside it (sorted
    by name); any other path is treated as a single page.
    """
    if not os.path.isdir(target):
        return [(os.path.basename(target), target)]
    pages = []
    for name in sorted(os.listdir(target)):
        # os.path.join works whether or not *target* ends with a separator.
        path = os.path.join(target, name)
        if os.path.isfile(path):
            pages.append((name, path))
    return pages


def _store_page(label, fileobj):
    """Parse the HTML read from *fileobj* and save it as one ``Blog`` row."""
    soup = BeautifulSoup(fileobj.read(), 'html.parser')
    post = _extract_post(soup, label)
    if post is None:
        sys.stderr.write(
            "skipping %s: no container_conteudo_publicacao div found\n" % label
        )
        return
    title, author, author_url, text = post
    blog = Blog(title=title, author=author, author_url=author_url, text=text)
    blog.save()


def main(argv):
    """Import the pages named by ``argv[1]`` (a directory or a single file).

    Without an argument the page is read from standard input, which is what
    the original ``except IndexError`` fallback was meant to do.
    """
    # The table only needs to be created once per run, not once per page.
    Blog.create_table()

    if len(argv) < 2:
        print(sys.stdin.name)
        _store_page(sys.stdin.name, sys.stdin)
        return

    for label, path in _list_pages(argv[1]):
        print(label)
        with open(path, 'r') as fileobj:
            _store_page(label, fileobj)


if __name__ == "__main__":
    main(sys.argv)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# Run blog.py once for every .html file directly inside the directory
# given as the first argument, echoing each path after it is processed.
# NOTE(review): this passes one file path per call -- confirm blog.py
# accepts a single file rather than a directory.
dir="${1:?usage: $0 <directory>}"

for entry in "$dir"/*.html
do
    # With no matches the glob pattern is left as a literal string; skip it.
    [ -e "$entry" ] || continue
    # Quote the path so names containing spaces stay a single argument.
    python3 blog.py "$entry"
    echo "$entry"
done
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import peewee | |
from peewee import * | |
# Database handle that the model's Meta class binds to.
# NOTE(review): the account name and password are hard-coded in the source;
# replace them with your own and consider keeping them out of version control.
db = MySQLDatabase(
    "redacao_inteligente",
    user="root",         # your username
    passwd="emakersjr",  # your password
)
class Blog(Model):
    """Record with title, text, author and author_url columns, stored via ``db``."""

    # Field order is kept as-is: it determines the column order of the table.
    title = CharField()
    text = TextField()
    author = CharField()
    author_url = CharField()

    class Meta:
        database = db
# Create the table for Blog, then insert one hard-coded sample row.
Blog.create_table()

sample_fields = {
    "title": "Introducao",
    "author": "eu",
    "author_url": "uiutubiuponcotom",
    "text": "LALLALALAL ALLALALALLALLALA",
}
blog = Blog(**sample_fields)
blog.save()

# A raw-cursor check (db.cursor(), "SELECT * FROM blog", printing row[1] of
# every fetched row) used to follow here and is currently disabled.
db.close()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import peewee | |
from peewee import * | |
from bs4 import BeautifulSoup | |
import sys | |
import os | |
# Database handle that the model's Meta class binds to.
# NOTE(review): the account name and password are hard-coded in the source;
# consider moving them out of version control.
db = MySQLDatabase(
    "redacao_inteligente",
    user="root",
    passwd="emakersjr",
)
class Tema(Model):
    """Record with title, text, author and author_url columns, stored via ``db``."""

    # Field order is kept as-is: it determines the column order of the table.
    title = CharField()
    text = TextField()
    author = CharField()
    author_url = CharField()

    class Meta:
        database = db
def _extract_theme(soup, fallback_title):
    """Return ``(title, text)`` for a parsed page.

    The title is the text of the first ``h1.barra_paginas_internas`` found
    anywhere in the page, or *fallback_title* when there is none.  The text
    comes from the ``<article>`` inside the first
    ``div.container_conteudo_publicacao``, or from the whole container when
    it has no ``<article>``.

    Returns ``None`` when the page has no such container at all, so the
    caller can skip the page instead of crashing on ``None.article``.
    """
    # Look the container up once instead of re-searching the tree per use.
    container = soup.find('div', class_='container_conteudo_publicacao')
    if container is None:
        return None

    heading = soup.find('h1', class_='barra_paginas_internas')
    title = fallback_title if heading is None else heading.getText()

    article = container.article
    text = (container if article is None else article).getText()
    return title, text


def _list_pages(target):
    """Return ``(label, path)`` pairs for the page(s) named by *target*.

    A directory yields one pair per regular file directly inside it (sorted
    by name); any other path is treated as a single page.
    """
    if not os.path.isdir(target):
        return [(os.path.basename(target), target)]
    pages = []
    for name in sorted(os.listdir(target)):
        # os.path.join works whether or not *target* ends with a separator.
        path = os.path.join(target, name)
        if os.path.isfile(path):
            pages.append((name, path))
    return pages


def _store_page(label, fileobj):
    """Parse the HTML read from *fileobj* and save it as one ``Tema`` row."""
    soup = BeautifulSoup(fileobj.read(), 'html.parser')
    theme = _extract_theme(soup, label)
    if theme is None:
        sys.stderr.write(
            "skipping %s: no container_conteudo_publicacao div found\n" % label
        )
        return
    title, text = theme
    # These pages carry no author information, so both fields stay empty.
    tema = Tema(title=title, author="", author_url="", text=text)
    tema.save()


def main(argv):
    """Import the pages named by ``argv[1]`` (a directory or a single file).

    Without an argument the page is read from standard input, which is what
    the original ``except IndexError`` fallback was meant to do.
    """
    # The table only needs to be created once per run, not once per page.
    Tema.create_table()

    if len(argv) < 2:
        print(sys.stdin.name)
        _store_page(sys.stdin.name, sys.stdin)
        return

    for label, path in _list_pages(argv[1]):
        print(label)
        with open(path, 'r') as fileobj:
            _store_page(label, fileobj)


if __name__ == "__main__":
    main(sys.argv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment