Created
April 30, 2017 13:42
-
-
Save ruanbekker/9d41095043f6d38e7e7cf3674b2e5d6f to your computer and use it in GitHub Desktop.
Scrapes URL, Title and Keywords from Nested Sitemap to Elasticsearch
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time | |
import requests | |
from bs4 import BeautifulSoup | |
from elasticsearch import Elasticsearch | |
# Client for the Elasticsearch cluster that receives the scraped documents.
es_client = Elasticsearch(['http://search-domain:9200'])

# Start every run from an empty index: drop any existing 'myindex' first, then
# create it again.  The original ran create-then-delete (with the result names
# swapped), which left no index behind and relied on Elasticsearch
# auto-creating it on the first write.
# ignore=[400, 404] suppresses the error raised when the index does not exist.
drop_index = es_client.indices.delete(index='myindex', ignore=[400, 404])
# ignore=400 suppresses the error raised when the index already exists.
create_index = es_client.indices.create(index='myindex', ignore=400)
def urlparser(title, url):
    """Scrape a page's title and keywords and index them into Elasticsearch.

    Args:
        title: Address of the page to download.  Despite its name, this value
            is what gets passed to ``requests.get``; it is not stored as the
            document title.
        url: Value stored in the ``url`` field of the indexed document.

    Returns:
        None.  The Elasticsearch response is printed to stdout.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched,
            including when the request exceeds the timeout.
    """
    # A timeout keeps one unresponsive host from stalling the whole crawl.
    page = requests.get(title, timeout=30).content
    soup = BeautifulSoup(page, 'lxml')

    # Pages without a <title> element are indexed with a null title instead
    # of aborting the run with an AttributeError.
    title_name = soup.title.string if soup.title is not None else None

    # Keywords come from the first element whose ``name`` attribute is
    # "keywords" (normally a <meta> tag); a missing element or a missing
    # ``content`` attribute yields an empty tag list.
    keyword_elements = soup.findAll(attrs={"name": "keywords"})
    raw_keywords = keyword_elements[0].get('content', '') if keyword_elements else ''
    # Trim the whitespace around each keyword ("a, b" -> ["a", "b"]) and drop
    # empty entries so identical tags compare equal in the index.
    tag_names = [tag.strip() for tag in raw_keywords.split(',') if tag.strip()]

    # payload for elasticsearch
    doc = {
        'date': time.strftime("%Y-%m-%d"),
        'title': title_name,
        'tags': tag_names,
        'url': url,
    }

    # ingest payload into elasticsearch
    res = es_client.index(index="myindex", doc_type="docs", body=doc)
    print(res)

    # Pause between pages -- presumably to avoid hammering the scraped site.
    time.sleep(1.5)
def _sitemap_locations(feed_url):
    """Download a sitemap document and return the text of its ``<loc>`` elements.

    Whitespace around each entry is stripped and empty entries are dropped,
    so pretty-printed XML does not produce unusable URLs.
    """
    # A timeout keeps one unresponsive host from stalling the whole crawl.
    response = requests.get(feed_url, timeout=30)
    parsed = BeautifulSoup(response.content, 'html.parser')
    locations = (element.text.strip() for element in parsed.findAll('loc'))
    return [location for location in locations if location]


# Root sitemap; every <loc> in it is treated as the address of a sub-sitemap.
sitemap_feed = 'http://www.domain.com/sitemap.xml'
urls = _sitemap_locations(sitemap_feed)

for sub_sitemap_feed in urls:
    for page_url in _sitemap_locations(sub_sitemap_feed):
        # The page address is both the URL to fetch and the stored 'url' value.
        urlparser(page_url, page_url)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment