Skip to content

Instantly share code, notes, and snippets.

@powerswitch
Created June 19, 2014 18:21
Show Gist options
  • Save powerswitch/dde9385b52a821dceb28 to your computer and use it in GitHub Desktop.
Save powerswitch/dde9385b52a821dceb28 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from itertools import chain
import os, re
import pp
import sys
def pointsto(url, target):
    """Report whether the German Wikipedia page at *url* links to a target.

    The page ``http://de.wikipedia.org`` + *url* is fetched and parsed as
    HTML, and every ``<a>`` element inside the ``bodyContent`` div is
    inspected.

    Args:
        url: Site-relative page path that is appended to the host name,
            e.g. ``/wiki/Foo``.
        target: Container of site-relative ``/wiki/...`` paths; membership
            is tested with ``href in target``.

    Returns:
        *url* if at least one link whose href starts with ``/wiki/`` is
        contained in *target*, ``None`` if no such link exists, or
        ``"E" + url`` if fetching or parsing the page raised an exception.
    """
    # Imported locally because this function is submitted to a pp job server
    # and may therefore run outside this module's namespace.
    import sys
    try:
        from lxml import etree
        xml = etree.parse("http://de.wikipedia.org" + url,
                          parser=etree.HTMLParser())
        for link in xml.xpath(".//div[@id='bodyContent']//a"):
            href = link.get('href')
            # Anchors without an href attribute yield None; skip them instead
            # of letting the slice raise and flagging the whole page as an
            # error.
            if href and href.startswith('/wiki/') and href in target:
                return url
        return None
    except Exception:
        # Best effort: report the failure and hand back an "E"-prefixed
        # marker so the caller can log the page as failed.
        print("")
        print(sys.exc_info())
        return "E" + url
# Hosts handed to pp.Server as remote worker nodes. Swap in an empty tuple
# (``ppservers = ()``) to configure no remote nodes.
ppservers = ("node1", "node2", )
# NOTE(review): ncpus=0 presumably disables local workers so that all jobs
# run on the remote nodes -- confirm against the pp documentation.
job_server = pp.Server(ppservers=ppservers, secret="x", ncpus=0)
print("Starting pp with %s workers" % job_server.get_ncpus())

# Link targets each page is checked against.
targets = ["/wiki/Test"]
jobs = []

print("Creating jobs...")
# The context manager guarantees the index file is closed even if a
# submission fails part-way through.
with open("wpindex.csv", "r") as rf:
    for line in rf:
        # The URL is taken to be the first half of the comma-separated
        # fields of the line; the fields are re-joined with commas.
        parts = line.split(",")
        url = ",".join(parts[:len(parts) // 2])
        jobs.append(
            job_server.submit(
                pointsto,
                args=(url, targets),
                modules=("lxml.etree",),
            )
        )
        if len(jobs) % 1000 == 0:
            print("%d ..." % len(jobs))

print("Created %d jobs." % len(jobs))
# Width of the textual progress bar, in characters.
WIDTH = 100
lastwidth = 0
error = False
total = len(jobs)

# Draw the empty progress-bar frame without a trailing newline so that the
# carriage-return updates below overwrite it.
sys.stdout.write("[ %s ]" % (" " * WIDTH))

# Matching pages go to layer1.csv, failed pages to error1.csv. The context
# managers close both files even if retrieving a job result raises.
with open("layer1.csv", "w") as wf, open("error1.csv", "w") as ef:
    for count, j in enumerate(jobs, 1):
        filled = count * WIDTH // total
        # Redraw when the bar grows by at least one cell, and at least every
        # 50 jobs so the counter keeps moving.
        if filled > lastwidth or count % 50 == 0:
            lastwidth = filled
            sys.stdout.write("\r[ %s %d / %d" % ("#" * filled, count, total))
            sys.stdout.flush()

        # Calling the job object yields the value returned for that job.
        result = j()
        if not result:
            continue

        if result[0] == "E":
            # "E"-prefixed results mark failed pages; log the URL without
            # the marker and redraw an empty bar frame.
            error = True
            sys.stdout.write("[ %s ]" % (" " * WIDTH))
            ef.write(result[1:])
            ef.write("\n")
        else:
            wf.write(result)
            wf.write("\n")

if error:
    print("\nFinished with errors")
else:
    # Terminate the progress-bar line.
    print("")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment