Scraping the digiKam website's paginated news index for content to migrate to a new system: each post is saved as a Markdown file with YAML-style front matter (date, title, author, description, original URL) followed by the raw article HTML.
#!/usr/bin/python
from __future__ import unicode_literals
from selenium import webdriver
import time
import datetime
import os
import re
import sys

# Python 2 hack so implicit str/unicode conversions use UTF-8
reload(sys)
sys.setdefaultencoding('utf-8')
# Regexes to pull the date, time, and author out of each post's "submitted"
# line, which looks roughly like "Submitted by <name> on 2017-03-14 09:24"
yyyymmdd = re.compile(r'\d\d\d\d-\d\d-\d\d')                # YYYY-MM-DD
yyyymmddtime = re.compile(r'\d\d\d\d-\d\d-\d\d \d\d:\d\d')  # YYYY-MM-DD HH:MM
_submitter = re.compile(r'by (.+?) on')

def truncate_string(mystring, numwords):
    return ' '.join(mystring.split()[:numwords])
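# e.g. truncate_string('one two three four five', 3) -> 'one two three'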
print "#### BEGIN" | |
print "#### %s" % datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') | |
driver = webdriver.PhantomJS(executable_path='/usr/local/bin/phantomjs') | |
apage = webdriver.PhantomJS(executable_path='/usr/local/bin/phantomjs') | |
baseurl = "http://www.digikam.org/news?page=" | |

# The news index ran to page 35 at the time of the scrape
for page in range(0, 36):
    url = baseurl + "%d" % page
    driver.get(url)
    # One link per news post on the index page
    links = driver.find_elements_by_xpath("//div[contains(@class, 'node')]/h2[contains(@class, 'title')]/a")
    for entry in links:
        print("=====================")
        href = entry.get_attribute('href')
        apage.get(href)
        title = apage.find_element_by_xpath("//div[contains(@id, 'main')]/h1[contains(@class, 'title')]")
        submitted = apage.find_element_by_xpath("//div[contains(@class, 'node')]/span[contains(@class, 'submitted')]")
        content = apage.find_element_by_xpath("//div[contains(@class, 'node')]/div[contains(@class, 'content')]")
        print(title.text)
        # Date of submission, for the front matter and the filename;
        # fall back to an obvious sentinel if no date is found
        try:
            shortdate = yyyymmdd.search(submitted.text).group()
        except AttributeError:
            shortdate = "2000-01-01"
        try:
            longdate = yyyymmddtime.search(submitted.text).group()  # e.g. 2017-03-14 09:24
        except AttributeError:
            longdate = "2000-01-01 00:00"
        _date = time.strptime(longdate, "%Y-%m-%d %H:%M")
        # ISO 8601 timestamp; the 'Z' suffix assumes the site times are UTC
        _isodate = time.strftime('%Y-%m-%dT%H:%M:%SZ', _date)
        print(_isodate)
        try:
            submitter = _submitter.search(submitted.text).group(1)
        except AttributeError:
            submitter = "Nobody"
        print(submitter)
        # Build a filesystem-safe filename from the date and title
        _title = re.sub(r'\.{2,}', '', title.text)  # Remove ellipses
        _title = re.sub(r'[^a-zA-Z0-9-\.]', '_', _title)
        filename = shortdate + '_' + _title + '.md'
        print(filename)
        # First 25 words of the post, with double quotes escaped so the
        # description stays valid inside the quoted YAML value
        _description = truncate_string(content.text, 25)
        _description = _description.replace('"', '\\"')
        print(_description)
        if not os.path.exists(filename):
            with open(filename, 'w') as afile:
                # YAML front matter, then the raw article HTML
                afile.write('---\n')
                afile.write('date: "' + _isodate + '"\n')
                afile.write('title: "' + title.text.encode('utf-8') + '"\n')
                afile.write('author: "' + submitter.encode('utf-8') + '"\n')
                afile.write('description: "' + _description.encode('utf-8') + '"\n')
                afile.write('taxonomies: "news"\n')
                afile.write('orig_url: "' + href + '"\n')
                afile.write('\n---\n\n')
                afile.write(content.get_attribute('innerHTML').encode('utf-8'))
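
PhantomJS has since been discontinued, and Selenium 4 dropped both webdriver.PhantomJS and the find_element(s)_by_xpath helpers, so the script above only runs on the old Python 2 / Selenium stack it was written for. As a minimal, untested sketch, the index fetch would look something like this on a modern stack (assuming Python 3, selenium 4+, and Firefox with geckodriver available on the PATH):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options

options = Options()
options.add_argument('-headless')  # no visible browser window
driver = webdriver.Firefox(options=options)

driver.get('http://www.digikam.org/news?page=0')
# Same XPath as above, via the Selenium 4 find_elements(By.XPATH, ...) API
for a in driver.find_elements(By.XPATH, "//div[contains(@class, 'node')]/h2[contains(@class, 'title')]/a"):
    print(a.get_attribute('href'))
driver.quit()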