Skip to content

Instantly share code, notes, and snippets.

@fernandojunior
Last active July 5, 2016 18:12
Show Gist options
  • Save fernandojunior/bf1af23abe0c95935a8fc7b9e2d89073 to your computer and use it in GitHub Desktop.
Save fernandojunior/bf1af23abe0c95935a8fc7b9e2d89073 to your computer and use it in GitHub Desktop.
bs beautifulsoup python
# pip install beautifulsoup4
# pip install six
from datetime import datetime
from six.moves.urllib.request import urlopen
from bs4 import BeautifulSoup
def get_posts(url='https://bitcointalk.org/index.php?topic=1463943.0'):
    """Yield the posts found on one page of a bitcointalk.org topic.

    The page is downloaded and parsed lazily, i.e. on the first iteration
    of the returned generator.

    Args:
        url: Address of the topic page to scrape. Defaults to the topic the
            script was originally written for.

    Yields:
        dict: One mapping per post with the keys
            'user'    -- first child of the poster link, as a string;
            'time'    -- posting time parsed into a ``datetime``;
            'message' -- list with the string form of every child node of
                         the post body (text and markup alike).

    Raises:
        ValueError: If a post's timestamp text does not match the
            ``'%b %d, %Y, %I:%M:%S %p'`` format.
    """
    # Function-scope import so the block does not depend on a new
    # file-level import.
    from contextlib import closing

    # closing() guarantees the HTTP response is released, including on
    # Python 2 where the object returned by urlopen is not itself a
    # context manager (the file targets both versions through six).
    with closing(urlopen(url)) as response:
        html = response.read()

    soup = BeautifulSoup(html, 'html.parser')
    rows = soup.select('#bodyarea .windowbg, #bodyarea .windowbg2')
    for row in rows:
        user_tags = row.select('.poster_info b a')
        time_tags = row.select('.td_headerandpost table .smalltext')
        post_tags = row.select('.post')
        # The row selector is broad; anything that lacks one of the three
        # parts is not a post and is skipped instead of raising IndexError.
        if not (user_tags and time_tags and post_tags):
            continue

        user = user_tags[0].contents
        posted = time_tags[0].contents
        message = post_tags[0].contents
        # Both are indexed below, so an empty node cannot be processed.
        if not (user and posted):
            continue

        stamp = posted[0]
        # The timestamp may be wrapped in another tag; unwrap one level.
        # NOTE(review): presumably the markup used for edited posts -- confirm.
        if hasattr(stamp, 'contents'):
            stamp = stamp.contents[0]

        yield {
            'user': str(user[0]),
            'time': datetime.strptime(str(stamp), '%b %d, %Y, %I:%M:%S %p'),
            'message': [str(content) for content in message],
        }
# Run the scraper to completion and dump every parsed post to stdout.
scraped_posts = list(get_posts())
print(scraped_posts)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment