Skip to content

Instantly share code, notes, and snippets.

@slothyrulez
Last active April 6, 2018 10:38
Show Gist options
  • Save slothyrulez/a5cd7d5fd48cb46782805ae91c961e6c to your computer and use it in GitHub Desktop.
Save slothyrulez/a5cd7d5fd48cb46782805ae91c961e6c to your computer and use it in GitHub Desktop.
Basic Wikipedia scraping
# -*- coding: utf-8 -*-
import pprint
import lxml.html
from urllib import request
def get_page(url):
    """Open *url* with ``request.urlopen`` and return the response object."""
    response = request.urlopen(url)
    return response
def read_document(response):
    """Return whatever a single ``response.read()`` call produces."""
    body = response.read()
    return body
def extract_data(document):
    """Extract the label/value pairs from a page's infobox table.

    The document is parsed with ``lxml.html.fromstring`` and every infobox
    row that has both a ``th`` and a ``td`` child contributes one entry.

    Args:
        document: HTML markup accepted by ``lxml.html.fromstring``.

    Returns:
        A dict mapping the text content of each row's first ``th`` to the
        text content of its first ``td``. The dict is empty when no row
        matches; if two rows share the same header text, the later one wins.
    """
    # Generate document tree
    tree = lxml.html.fromstring(document)
    # Select rows that are direct children of the infobox table and that
    # contain at least one th and one td child.
    # NOTE(review): rows wrapped in an explicit <tbody>, or tables whose
    # class attribute holds more than just "infobox", will not match this
    # expression -- confirm against the current page markup.
    rows = tree.xpath('//table[@class="infobox"]/tr[th and td]')
    result = {}
    for row in rows:
        # Look the cells up by tag instead of unpacking all children: a row
        # may hold extra cells or comment nodes, which made the two-name
        # unpacking raise ValueError, and a td-before-th row would have had
        # its label and value swapped. The XPath predicate guarantees that
        # both lookups find an element.
        header = row.find("th")
        cell = row.find("td")
        result[header.text_content()] = cell.text_content()
    return result
if __name__ == "__main__":
    # Spanish Wikipedia article to scrape for each programming language.
    languages = {
        "python": "https://es.wikipedia.org/wiki/Python",
        "Rust": "https://es.wikipedia.org/wiki/Rust_(lenguaje_de_programaci%C3%B3n)",
        "Java": "https://es.wikipedia.org/wiki/Java_(lenguaje_de_programaci%C3%B3n)",
        "Javascript": "https://es.wikipedia.org/wiki/JavaScript",
    }
    # Maps each language name to the infobox data extracted from its page.
    result = {}
    for name, url in languages.items():
        # Close every HTTP response as soon as its body has been read so
        # connections are not left open until garbage collection.
        with get_page(url) as response:
            document = read_document(response)
        result[name] = extract_data(document)
    pprint.pprint(result)
@slothyrulez
Copy link
Author

Output:

....
 'python': {'Apareció en': '1991',
            'Dialectos': 'Stackless Python, RPython',
            'Diseñado por': 'Guido van Rossum',
            'Extensiones comunes': '.py, .pyc, .pyd, .pyo, .pyw',
            'Ha influido a': 'Boo, Cobra, D, Falcon, Genie, Groovy, Ruby, '
                             'JavaScript, Cython, Go',
            'Implementaciones': 'CPython, IronPython, Jython, Python for S60, '
                                'PyPy, Pygame, ActivePython, Unladen Swallow',
            'Influido por': 'ABC, ALGOL 68, C, Haskell, Icon, Lisp, Modula-3, '
                            'Perl, Smalltalk, Java',
            'Licencia': 'Python Software Foundation License',
            'Paradigma': 'Multiparadigma: orientado a objetos, imperativo, '
                         'funcional, reflexivo',
            'Sistema de tipos': 'Fuertemente tipado, dinámico',
            'Sistema operativo': 'Multiplataforma',
            'Última versión en pruebas': '3.7.0a3 (6 de diciembre de 2017 (3 '
                                         'meses\xa0y\xa025 días))',
            'Última versión estable': '3.6.5 / 2.7.11 (28 de marzo de 2018 (3 '
                                      'días))'}}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment