Skip to content

Instantly share code, notes, and snippets.

@komly
Created October 14, 2015 17:58
Show Gist options
  • Save komly/9adf6d4aa93af8001c60 to your computer and use it in GitHub Desktop.
Save komly/9adf6d4aa93af8001c60 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import re
import json
from urllib.request import urlopen
# Inclusive range of thread numbers to download, read interactively
# (start is prompted for first, then finish).
start = int(input('Start thread number: '))
finish = int(input('Finish thread number: '))

# URL template for a single thread; %d is filled with the thread number.
url = 'http://www.forumishqiptar.com/threads/%d'
# Matches one post and captures (raw title markup, raw message markup).
# DOTALL lets a single post span several lines of the page.
_POST_RE = re.compile(
    '<h2 class="title icon">(.*?)</h2>.*?<div id="post_message_.*?">(.*?)</div>',
    flags=re.DOTALL,
)
# Any HTML tag. [^>]* (unlike .*? without DOTALL) also matches tags whose
# attributes wrap onto the next line, so they no longer leak into the text.
_TAG_RE = re.compile(r'<[^>]*>')
# Translation table that deletes tab and carriage-return characters.
_DROP_TABS_AND_CR = str.maketrans('', '', '\t\r')


def _strip_markup(fragment):
    """Return *fragment* with HTML tags, tabs and carriage returns removed."""
    return _TAG_RE.sub('', fragment).translate(_DROP_TABS_AND_CR)


def find_posts(text):
    """Extract the posts found in the HTML of a thread page.

    A post is an ``<h2 class="title icon">`` heading followed by a
    ``<div id="post_message_...">`` element. The message text ends at the
    first ``</div>`` after the opening tag, so a message containing nested
    ``<div>`` elements is cut at the first inner closing tag.

    Args:
        text: Decoded HTML of the page.

    Returns:
        A list of ``{'title': ..., 'body': ...}`` dicts in page order, with
        tags, tabs and carriage returns removed. Newlines are kept and HTML
        entities are left as they appear in the page. The list is empty when
        no post matches.
    """
    return [
        {'title': _strip_markup(title), 'body': _strip_markup(body)}
        for title, body in _POST_RE.findall(text)
    ]
def get_thread(i, timeout=30):
    """Download thread number *i* and return the posts it contains.

    Args:
        i: Thread number substituted into the module-level ``url`` template.
        timeout: Seconds to wait for the server before giving up, so one
            stalled connection cannot hang the whole run.

    Returns:
        The list returned by ``find_posts`` for the downloaded page, or
        ``None`` when the download or parsing failed (the error is printed).
    """
    print('Download thread %d' % i)
    try:
        # The context manager closes the HTTP response even if reading fails.
        with urlopen(url % i, timeout=timeout) as response:
            raw = response.read()
            # Honour the charset declared in the Content-Type header; fall
            # back to the codec this script has always assumed.
            charset = response.headers.get_content_charset() or 'windows-1251'
        # A single undecodable byte should not discard the whole thread.
        return find_posts(raw.decode(charset, errors='replace'))
    except Exception as e:
        # Best effort: report the failure and let the caller skip the thread.
        print(e)
        return None
# Download every requested thread first, keyed by its URL. Threads that
# failed (None) or contained no posts (empty list) are left out.
data = {}
for i in range(start, finish + 1):
    thread = get_thread(i)
    if thread:
        data[url % i] = thread

# Open the output only after the downloads are done, so an interrupted run
# does not truncate a previous result and leave an empty file behind.
with open('albanian_threads.json', 'w', encoding='utf-8') as f:
    json.dump(data, f)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment