Created
February 14, 2021 23:52
-
-
Save ercas/ddd251bc49fc60aff94561b399963bb7 to your computer and use it in GitHub Desktop.
space email scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import bs4 | |
import dateutil.parser | |
import peewee | |
import requests | |
import tqdm | |
# Endpoint that serves a single post; it is queried with a POST request
# carrying the post id as form data.
POST_PAGE = "https://space.galaxybuster.net/lib/view.php"
# Upper bound of the post ids the scraper walks through.
LAST_ID = 333377
# SQLite database file that holds the scraped posts.
db = peewee.SqliteDatabase("space-email.db")
class Post(peewee.Model):
    """One scraped post, stored as a row in the SQLite database."""

    # The post's numeric id is used directly as the table's primary key.
    id = peewee.IntegerField(primary_key=True, unique=True, index=True)

    # Text content extracted from the post markup.
    subject = peewee.CharField()
    sender = peewee.CharField()
    body = peewee.TextField()

    # Parsed timestamp taken from the post's date element.
    date = peewee.DateTimeField()

    class Meta:
        # Bind the model to the module-level database handle.
        database = db
def get_post(id: int) -> Post:
    """Fetch one post from the site and insert it into the database.

    Args:
        id: Numeric id of the post, sent as the ``id`` form field.

    Returns:
        The ``Post`` row built from the response. ``Post.create`` has
        already inserted it, so callers do not need to save it again.

    Raises:
        requests.RequestException: The request failed, timed out, or
            returned an HTTP error status.
        ValueError: The returned markup lacks one of the expected elements.
    """
    # A timeout keeps a single stalled connection from hanging a long scrape.
    response = requests.post(POST_PAGE, data={"id": id}, timeout=30)
    response.raise_for_status()

    # The first item of the JSON payload is parsed as the post's HTML.
    post_html = bs4.BeautifulSoup(response.json()[0], "lxml")

    def text_of(selector: str) -> str:
        """Return the raw text of the element matching *selector*."""
        element = post_html.select_one(selector)
        if element is None:
            # Fail with a message naming the post instead of an opaque
            # AttributeError on None.
            raise ValueError(
                "post {}: element {} not found in response".format(
                    id, selector
                )
            )
        return element.text

    return Post.create(
        id=id,
        subject=text_of("#msgSubject").strip(),
        sender=text_of("#msgSender").strip(),
        # Carriage returns in the body are stored as newlines.
        body=text_of("#msgBody").strip().replace("\r", "\n"),
        date=dateutil.parser.parse(text_of("#msgDate")),
    )
if __name__ == "__main__":
    db.connect()
    try:
        db.create_tables([Post])
        db.commit()

        # Resume after the highest post id already stored; an empty table
        # yields None, in which case scraping starts from id 1.
        latest_post = Post.select(peewee.fn.max(Post.id)).scalar()
        if latest_post is None:
            latest_post = 0
        print("skipping {} existing posts".format(latest_post))

        # range() excludes its stop value, so add one to include LAST_ID
        # itself in the scrape.
        for post_id in tqdm.tqdm(
            range(latest_post + 1, LAST_ID + 1),
            desc="scraping posts",
            unit=" posts",
        ):
            # get_post() inserts the row via Post.create(); calling .save()
            # on the result would only issue a redundant UPDATE per post.
            get_post(post_id)
    finally:
        # Release the SQLite connection even if a request or parse fails.
        db.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment