Last active
February 11, 2020 09:34
-
-
Save aaferrari/fbd3d861771aa255b52b9f5500fe9e3b to your computer and use it in GitHub Desktop.
Obtiene la cantidad de comentarios que hay en los posts de bloglenovo.es (funciona con Python 2 y 3)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import re
import sys
from sys import version_info

from requests import request
from requests.exceptions import RequestException

if version_info.major == 2:
    from urllib import urlencode
elif version_info.major == 3:
    from urllib.parse import urlencode
    unichr = chr
def html2char(cadena):
    """Decode decimal numeric HTML entities (``&#NNN;``) in *cadena*.

    Only decimal numeric references are handled; hexadecimal (``&#xNN;``)
    and named (``&amp;``) entities are left untouched.

    The text is decoded in a single left-to-right pass, so the output of
    one replacement is never re-scanned: ``&#38;#39;`` becomes ``&#39;``
    rather than being decoded a second time.

    Entities whose number is not a valid code point on this interpreter
    are left in the text unchanged instead of raising.

    Args:
        cadena: Text that may contain decimal numeric entities.

    Returns:
        The text with every decodable entity replaced by its character.
    """
    # Python 2 has a builtin ``unichr``; Python 3 only has ``chr`` (unless
    # the module defines an alias).  Resolve it here so the function does
    # not depend on a module-level shim.
    try:
        a_caracter = unichr
    except NameError:
        a_caracter = chr

    def _decodificar(coincidencia):
        # Convert one matched entity; keep the original text if the
        # number is outside the range the interpreter can represent.
        try:
            return a_caracter(int(coincidencia.group(1)))
        except (ValueError, OverflowError):
            return coincidencia.group(0)

    return re.sub(r"&#([0-9]+);", _decodificar, cadena)
# Sitemaps that list the blog posts to inspect.
sitemaps = [
    "https://www.bloglenovo.es/post-sitemap1.xml",
    "https://www.bloglenovo.es/post-sitemap2.xml",
]

# Disqus embed page for one post; %s receives the URL-encoded "t_u"
# query parameter carrying the post URL.
DISQUS_EMBED_URL = (
    "https://disqus.com/embed/comments/?base=default&f=bloglenovo-es&%s"
    "&s_o=default#version=159e1a03b9c07c09a458f7a036c2696b"
)

# Seconds to wait for each HTTP response.  Without an explicit timeout
# requests can wait indefinitely on a stalled connection.
TIMEOUT = 30

# Matches every <loc> entry of a sitemap.
LOC_RE = re.compile(r"<loc>([^<]+)</loc>")

# Matches the fragment of the response text that runs from the "posts"
# counter up to the "title" string; it is wrapped in braces and parsed as
# JSON below.  The title part accepts backslash escapes so a title that
# contains \" is captured whole instead of being cut at the first quote.
THREAD_RE = re.compile(r'"posts":[0-9]+.+"title":"(?:[^"\\]|\\.)+"')


def _warn(mensaje):
    # Report a skipped item on stderr so stdout keeps only the data rows.
    sys.stderr.write(mensaje + "\n")


enlaces = []
# Collect the post links from every sitemap.
for mapa in sitemaps:
    try:
        peticion = request("GET", mapa, timeout=TIMEOUT)
    except RequestException as error:
        _warn("Skipping sitemap %s: %s" % (mapa, error))
        continue
    enlaces.extend(LOC_RE.findall(peticion.text))

# Fetch the title and comment count of each post and print one
# "url;title;count" row per post.
for post in enlaces:
    url = DISQUS_EMBED_URL % urlencode({"t_u": post})
    try:
        comentarios = request("GET", url, timeout=TIMEOUT)
    except RequestException as error:
        _warn("Skipping %s: %s" % (post, error))
        continue

    coincidencia = THREAD_RE.search(comentarios.text)
    if coincidencia is None:
        # No thread data in the response: skip this post rather than
        # abort the whole run.
        _warn("Skipping %s: no thread data found" % post)
        continue

    try:
        datos = json.loads("{" + coincidencia.group(0) + "}")
    except ValueError as error:
        # json raises ValueError (JSONDecodeError on Python 3) when the
        # extracted fragment is not valid JSON.
        _warn("Skipping %s: invalid thread data (%s)" % (post, error))
        continue

    datos["title"] = html2char(datos["title"])
    print("%s;%s;%i" % (post, datos["title"], datos["posts"]))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment