Created
October 14, 2015 17:58
-
-
Save komly/9adf6d4aa93af8001c60 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import re | |
import json | |
from urllib.request import urlopen | |
# Inclusive range of thread numbers to download, read interactively.
# int() raises ValueError if the user types something that is not an integer.
start = int(input('Start thread number: '))
finish = int(input('Finish thread number: '))

# URL template for a single thread; '%d' is filled with the thread number.
url = 'http://www.forumishqiptar.com/threads/%d'
# Matches one post: the contents of its <h2 class="title icon"> heading and
# the contents of the following <div id="post_message_...">. DOTALL lets
# '.' span line breaks, since both parts normally cover several lines.
# Compiled once at import time instead of on every call.
_POST_RE = re.compile(
    '<h2 class="title icon">(.*?)</h2>.*?<div id="post_message_.*?">(.*?)</div>',
    flags=re.DOTALL,
)

# '[^>]' also matches newlines, so a tag whose attributes wrap onto the next
# line is removed as well. (The earlier '<.*?>' pattern stopped at the line
# break and left such tags in the extracted text.)
_TAG_RE = re.compile('<[^>]*>')

# Translation table that deletes tab and carriage-return characters.
_DROP_TABS_AND_CRS = str.maketrans('', '', '\t\r')


def _clean(fragment):
    """Return *fragment* with HTML tags, tabs and carriage returns removed."""
    return _TAG_RE.sub('', fragment).translate(_DROP_TABS_AND_CRS)


def find_posts(text):
    """Extract the posts found in the HTML source of a thread page.

    Args:
        text: HTML of the page, as a ``str``.

    Returns:
        A list with one ``{'title': ..., 'body': ...}`` dict per matched
        post, in page order. Tags, tabs and carriage returns are stripped
        from both values; newlines and HTML entities are left as they are.
        The list is empty when nothing matches.

    Note:
        The body capture ends at the first ``</div>`` after the post
        container opens, so a post containing a nested ``<div>`` is cut off
        at that inner closing tag.
    """
    return [
        {'title': _clean(title), 'body': _clean(body)}
        for title, body in _POST_RE.findall(text)
    ]
def get_thread(i):
    """Download thread number *i* and return the posts found on its page.

    Args:
        i: Thread number; substituted into the module-level ``url`` template.

    Returns:
        The list produced by ``find_posts`` for the downloaded page, or
        ``None`` if downloading, decoding or parsing raised an exception
        (the exception is printed, not propagated).
    """
    print('Download thread %d' % i)
    try:
        # Use the response as a context manager so the underlying
        # connection is closed as soon as the body has been read, even if
        # read() fails part-way.
        with urlopen(url % i) as response:
            raw = response.read()
        # NOTE(review): the page is always decoded as windows-1251 — confirm
        # this matches the charset the forum actually serves.
        return find_posts(raw.decode('windows-1251'))
    except Exception as e:
        # Deliberately best-effort: one failing thread is reported and
        # skipped so the rest of the range can still be downloaded.
        print(e)
        return None
# Download every thread in the requested range first and open the output
# file only afterwards. Opening it in 'wb' mode up front would truncate an
# existing 'albanian_threads.json' immediately, so an interrupted run would
# destroy earlier results and leave an empty file behind.
data = {}
for i in range(start, finish + 1):
    thread = get_thread(i)
    # get_thread returns None on failure and may return an empty list when
    # no posts matched; both are falsy and are left out of the result.
    if thread:
        data[url % i] = thread

# One JSON object mapping thread URL -> list of posts. json.dumps escapes
# non-ASCII characters by default, so the UTF-8 encoded bytes are plain ASCII.
with open('albanian_threads.json', 'wb') as f:
    f.write(json.dumps(data).encode('utf-8'))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment