Skip to content

Instantly share code, notes, and snippets.

@aaferrari
Last active February 11, 2020 09:34
Show Gist options
  • Save aaferrari/fbd3d861771aa255b52b9f5500fe9e3b to your computer and use it in GitHub Desktop.
Save aaferrari/fbd3d861771aa255b52b9f5500fe9e3b to your computer and use it in GitHub Desktop.
Obtiene la cantidad de comentarios que hay en los posts de bloglenovo.es (funciona con Python 2 y 3)
import json
import re
import sys
from sys import version_info

from requests import request
from requests.exceptions import RequestException

if version_info.major == 2:
    from urllib import urlencode
elif version_info.major == 3:
    from urllib.parse import urlencode
    unichr = chr
# Converts characters encoded as numeric HTML entities into plain text
def html2char(cadena):
    """Replace decimal numeric HTML entities in *cadena* with their characters.

    Only the decimal ``&#NNN;`` form is recognised; named (``&amp;``) and
    hexadecimal (``&#x41;``) entities are returned untouched.

    The text is decoded in a single left-to-right pass, so a character
    produced by one replacement can never join the text that follows it to
    form a new entity that gets decoded a second time.

    Args:
        cadena: Text that may contain numeric HTML entities.

    Returns:
        The text with every valid decimal entity replaced by its character.
        Entities whose number is not a valid code point are left as they are.
    """
    def _decodificar(coincidencia):
        try:
            return unichr(int(coincidencia.group(1)))
        except (ValueError, OverflowError):
            # The number is outside the range accepted by chr()/unichr():
            # keep the raw entity instead of aborting the whole conversion.
            return coincidencia.group(0)

    return re.sub("&#([0-9]+);", _decodificar, cadena)
sitemaps = ["https://www.bloglenovo.es/post-sitemap1.xml", "https://www.bloglenovo.es/post-sitemap2.xml"]
enlaces = []
# Seconds to wait for a server; requests applies no timeout unless one is given.
TIMEOUT = 30
# Fragment of the Disqus embed page that goes from the "posts" counter to the
# thread "title"; wrapped in braces it is parsed as a JSON object below.
PATRON_DATOS = re.compile('"posts":[0-9]+.+"title":"[^"]+"')


def _descargar(url):
    """Return the response for a GET of *url*, or None if the download failed.

    Connection errors, timeouts and HTTP error statuses are reported on
    stderr instead of being raised, so one bad URL does not stop the run.
    """
    try:
        respuesta = request('GET', url, timeout=TIMEOUT)
        respuesta.raise_for_status()
    except RequestException as error:
        sys.stderr.write("Could not download %s: %s\n" % (url, error))
        return None
    return respuesta


# Collect the links of the posts listed in every sitemap
for mapa in sitemaps:
    peticion = _descargar(mapa)
    if peticion is None:
        continue
    enlaces.extend(re.findall("<loc>([^<]+)</loc>", peticion.text))

# Get the title and the number of comments of every post
for post in enlaces:
    comentarios = _descargar("https://disqus.com/embed/comments/?base=default&f=bloglenovo-es&%s&s_o=default#version=159e1a03b9c07c09a458f7a036c2696b" % urlencode({"t_u": post}))
    if comentarios is None:
        continue
    coincidencia = PATRON_DATOS.search(comentarios.text)
    if coincidencia is None:
        # The page carries no thread data for this post; skip it and go on.
        sys.stderr.write("No Disqus data found for %s\n" % post)
        continue
    try:
        datos = json.loads("{" + coincidencia.group(0) + "}")
    except ValueError as error:
        # The matched fragment was not valid JSON once wrapped in braces.
        sys.stderr.write("Could not parse Disqus data for %s: %s\n" % (post, error))
        continue
    datos["title"] = html2char(datos["title"])
    print("%s;%s;%i" % (post, datos["title"], datos["posts"]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment