Skip to content

Instantly share code, notes, and snippets.

@clonn
Created January 27, 2022 14:33
Show Gist options
  • Save clonn/060ce98263c8fba45ab90ddd1965f55f to your computer and use it in GitHub Desktop.
Save clonn/060ce98263c8fba45ab90ddd1965f55f to your computer and use it in GitHub Desktop.
PTT Gossiping board crawler built with requests and BeautifulSoup
import re
import sqlite3
import uuid
import requests
from bs4 import BeautifulSoup
# Upper bound (exclusive) on the index page numbers that main() walks.
MAX_PAGE = 10000

# Cookie sent with every request.
# NOTE(review): presumably bypasses the board's over-18 confirmation -- confirm.
COOKIES = {'over18': '1'}

# Base URLs: the site's board root and the Gossiping board under it.
root_url = 'https://www.ptt.cc/bbs'
root_page = f'{root_url}/Gossiping'

# Shared SQLite connection/cursor; the untyped ``post`` table is created on
# first run and holds one metadata row per saved post.
con = sqlite3.connect('ptt.sqlite')
cur = con.cursor()
cur.execute("create table if not exists post (id, author, title)")
def get_post_urls(page, timeout=30):
    """Return the post links found on one Gossiping index page.

    Args:
        page: Index page number, interpolated into ``index{page}.html``.
        timeout: Seconds to wait on the server before ``requests`` gives up.

    Returns:
        A list of site-relative paths of the form
        ``/bbs/Gossiping/M<...>.html``, in page order (duplicates are kept).

    Raises:
        requests.exceptions.RequestException: On connection failure or
            timeout.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    page_url = f'{root_page}/index{page}.html'
    # Without a timeout a stalled connection would block the crawl forever.
    res = requests.get(page_url, cookies=COOKIES, timeout=timeout)
    html = res.content.decode('utf-8')
    # The dot before "html" is escaped so that only a literal ".html" suffix
    # matches (an unescaped "." would accept any character there).
    return re.findall(r'/bbs/Gossiping/M[\w+.]*\.html', html)
def save_meta_to_sql(post_id, title, author):
    """Insert one row into the ``post`` table and commit immediately.

    NOTE(review): the values are bound positionally as
    ``(post_id, title, author)`` against the column list
    ``(id, author, title)``, so the *second* argument lands in the
    ``author`` column and the *third* in ``title``. ``save_post`` calls this
    positionally with ``(post_id, author, title)``, so stored rows come out
    right; a keyword-argument caller would get the two columns swapped.
    """
    row = (post_id, title, author)
    cur.execute('INSERT INTO post (id, author, title) VALUES (?, ?, ?)', row)
    con.commit()
def save_content_as_file(post_id, post):
    """Write the text of a post to a file named ``post_id``.

    Args:
        post_id: Path of the file to create (an existing file is overwritten).
        post: The post content as a ``str``.
    """
    # Pin the encoding: the platform default may be unable to represent the
    # non-ASCII text of a post and would raise UnicodeEncodeError.
    with open(post_id, 'w', encoding='utf-8') as fd:
        fd.write(post)
def parse_author_and_title(post):
    """Extract the author and title from a post's HTML.

    The page is expected to contain exactly four elements with the CSS class
    ``article-meta-value``; the first is taken as the author and the third as
    the title. The second and fourth are ignored.

    Args:
        post: HTML of a single post.

    Returns:
        An ``(author, title)`` tuple of strings.

    Raises:
        ValueError: If the page does not contain exactly four
            ``article-meta-value`` elements.
    """
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed (and to avoid bs4's parser warning).
    parsed_post = BeautifulSoup(post, 'html.parser')
    values = [
        meta.text
        for meta in parsed_post.find_all(class_='article-meta-value')
    ]
    if len(values) != 4:
        raise ValueError(
            f'expected 4 article-meta-value elements, found {len(values)}'
        )
    author, _, title, _ = values
    return author, title
def save_post(text):
    """Persist one post: its HTML to a file, its metadata to SQLite.

    A fresh UUID1 string serves both as the file name and as the row id.
    The file is written before the database row is inserted.
    """
    post_id = str(uuid.uuid1())
    writer, subject = parse_author_and_title(text)
    save_content_as_file(post_id, text)
    # Positional on purpose: second argument -> author, third -> title.
    save_meta_to_sql(post_id, writer, subject)
def craw_post_content(post_url, timeout=30):
    """Download one post and hand its HTML to ``save_post``.

    Args:
        post_url: Site-relative path as returned by ``get_post_urls``. Its
            first four characters are dropped before it is appended to
            ``root_url`` (which already ends in ``/bbs``).
        timeout: Seconds to wait on the server before ``requests`` gives up.

    Raises:
        requests.exceptions.RequestException: On connection failure or
            timeout.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    full_url = f'{root_url}{post_url[4:]}'
    # Without a timeout a stalled connection would block the crawl forever.
    res = requests.get(full_url, cookies=COOKIES, timeout=timeout)
    save_post(res.content.decode('utf-8'))
def main():
    """Walk index pages 0 .. MAX_PAGE - 1 and save every post they link to."""
    for page in range(MAX_PAGE):
        for post_url in get_post_urls(page):
            craw_post_content(post_url)


# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment