Created December 17, 2019 19:18
Web scraping examples using the sample site http://example.webscraping.com
# Example 1: procedural scraper -- list the country links on the home page
# and print each country's capital.
import requests
from bs4 import BeautifulSoup

URL_BASE = 'http://example.webscraping.com'

peticion = requests.get(URL_BASE)
sopa = BeautifulSoup(peticion.content, 'html.parser')

# The country list sits in the second .span12 column, inside #results.
paises = sopa.find_all(class_="span12")[1].find(id="results")

for pais in paises.find_all('a'):
    enlace = URL_BASE + pais["href"]

    # Follow the country link and read the name and capital fields
    # (each value lives in a .w2p_fw cell of its table row).
    peticion2 = requests.get(enlace)
    sopa2 = BeautifulSoup(peticion2.content, 'html.parser')
    cont = sopa2.find_all(class_="span12")[1]
    nombre = cont.find(id="places_country__row").find(class_="w2p_fw").text
    capital = cont.find(id="places_capital__row").find(class_="w2p_fw").text
    print(f'{capital} - {nombre}')
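The chained find()/find_all() calls above can also be written with CSS selectors, which BeautifulSoup supports through select(). A minimal sketch of just the link-collection step, assuming the same markup (the country links inside the #results block):

# Sketch: collecting the country links with a CSS selector instead of
# chained find()/find_all() calls. The '#results a' selector is an
# assumption based on the markup used above.
import requests
from bs4 import BeautifulSoup

URL_BASE = 'http://example.webscraping.com'
sopa = BeautifulSoup(requests.get(URL_BASE).content, 'html.parser')

# '#results a' matches every <a> inside the results block.
for pais in sopa.select('#results a'):
    print(pais.text, URL_BASE + pais['href'])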
# Example 2: the same listing wrapped in a class; raspar() returns the
# country names and links as a list of dicts.
import requests
from bs4 import BeautifulSoup


class ClaseRaspador:
    def __init__(self):
        self.URL_BASE = 'http://example.webscraping.com'
        self.sopa = BeautifulSoup(requests.get(self.URL_BASE).content, 'html.parser')
        self.enlaces = []

    def raspar(self):
        datos = []
        paises = self.sopa.find_all(class_="span12")[1].find(id="results")
        for pais in paises.find_all('a'):
            nombre = pais.text
            enlace = self.URL_BASE + pais["href"]
            datos.append({"NOMBRE": nombre, "ENLACE": enlace})
            self.enlaces.append(enlace)
        return datos


r = ClaseRaspador()
print(r.raspar())
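raspar() returns a plain list of dicts, so its output serializes directly with the standard json module (which the original version of this snippet imported but never used). A minimal usage sketch; the paises.json file name is arbitrary:

import json

r = ClaseRaspador()
datos = r.raspar()

# Dump the scraped names and links; ensure_ascii=False keeps accented
# country names readable in the file.
with open('paises.json', 'w') as archivo:
    json.dump(datos, archivo, ensure_ascii=False, indent=2)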
# Example 3: full scraper -- walk the paginated index, visit every country
# page, and save the country/capital pairs to a JSON file.
import json
import requests
from bs4 import BeautifulSoup
from time import sleep

BASE_URL = 'http://example.webscraping.com'
URL = 'http://example.webscraping.com/places/default/index/'


def make_soup(url):
    peticion = requests.get(url)
    return BeautifulSoup(peticion.content, 'html.parser')


def obtener_enlaces(sopa):
    """Return the country links of one index page (empty list when done)."""
    enlaces = []
    resultados = sopa.find_all(class_="span12")[1].find(id="results")
    if resultados is None:
        return enlaces
    for pais in resultados.find_all('a'):
        enlaces.append(BASE_URL + pais["href"])
    return enlaces


def obtener_info(sopa):
    """Extract the country name and capital from one country page."""
    cont = sopa.find_all(class_="span12")[1]
    pais = cont.find(id="places_country__row").find(class_="w2p_fw").text
    capital = cont.find(id="places_capital__row").find(class_="w2p_fw").text
    return {"Pais": pais, "Capital": capital}


def guardar(resultado, archivo):
    # Plain-text alternative: one scraped record per line.
    for elemento in resultado:
        archivo.write(f'{elemento}\n')


def guardarjson(resultado, archivo):
    # json.dump serializes the whole list of dicts in one call;
    # ensure_ascii=False keeps accented names readable.
    json.dump(resultado, archivo, ensure_ascii=False, indent=2)


def main():
    datos = []
    n = 0
    while True:
        print('---------------------------------------------------------')
        print(f'pagina: {n}')
        print('---------------------------------------------------------')
        enlaces = obtener_enlaces(make_soup(f'{URL}{n}'))
        if not enlaces:
            break  # past the last index page: nothing left to scrape
        for enlace in enlaces:
            datos.append(obtener_info(make_soup(enlace)))
            sleep(1)  # be polite: pause between requests
        n += 1
    with open("capitales.json", "w") as archivo:
        guardarjson(datos, archivo)


main()
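The original imports also pulled in csv and pandas without using them; if you prefer tabular output over JSON, the datos list that main() builds writes out as CSV with the standard csv module. A sketch, assuming the Pais/Capital keys produced by obtener_info (guardarcsv is a hypothetical helper, not part of the original script):

import csv

def guardarcsv(resultado, ruta):
    # The dicts share the same keys, so they double as the header row.
    with open(ruta, 'w', newline='') as archivo:
        escritor = csv.DictWriter(archivo, fieldnames=["Pais", "Capital"])
        escritor.writeheader()
        escritor.writerows(resultado)

# e.g. guardarcsv(datos, "capitales.csv") at the end of main()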