Skip to content

Instantly share code, notes, and snippets.

@ercas
Created February 14, 2021 23:52
Show Gist options
  • Save ercas/ddd251bc49fc60aff94561b399963bb7 to your computer and use it in GitHub Desktop.
Save ercas/ddd251bc49fc60aff94561b399963bb7 to your computer and use it in GitHub Desktop.
space email scraper
import bs4
import dateutil.parser
import peewee
import requests
import tqdm
# Endpoint that get_post() POSTs a post id to in order to fetch that post.
POST_PAGE = "https://space.galaxybuster.net/lib/view.php"
# Upper bound of the scrape loop; it is passed to range() as the stop value,
# which range() treats as exclusive.
LAST_ID = 333377
# SQLite database file that stores the scraped posts; the Post model is bound
# to it through its Meta class.
db = peewee.SqliteDatabase("space-email.db")
class Post(peewee.Model):
    """A single scraped post, stored as one row of the ``db`` database.

    The primary key is not auto-generated: ``get_post`` supplies the same id
    it requested from the site.
    """

    class Meta:
        # Bind the model to the module-level SQLite database.
        database = db

    # Caller-supplied post id used as the primary key.
    id = peewee.IntegerField(primary_key=True, unique=True, index=True)
    # Text of the "#msgSubject" element.
    subject = peewee.CharField()
    # Text of the "#msgSender" element.
    sender = peewee.CharField()
    # Text of the "#msgBody" element, with "\r" replaced by "\n".
    body = peewee.TextField()
    # Parsed value of the "#msgDate" element.
    date = peewee.DateTimeField()
def get_post(id: int, timeout: float = 30.0) -> Post:
    """Fetch one post from the site, insert it into the database, return it.

    Args:
        id: Numeric id of the post; sent as the ``id`` form field. The name
            shadows the builtin but is kept so keyword callers keep working.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``Post`` row that was just inserted through ``Post.create``.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status.
        ValueError: If the returned HTML lacks one of the expected elements,
            or the response body / date text cannot be parsed.
    """
    # requests waits forever unless a timeout is given, so a single stalled
    # connection would otherwise freeze the whole scrape.
    response = requests.post(POST_PAGE, data={"id": id}, timeout=timeout)
    # Surface HTTP errors directly instead of failing later while parsing.
    response.raise_for_status()

    # The first item of the JSON payload is parsed as an HTML fragment.
    post_html = bs4.BeautifulSoup(response.json()[0], "lxml")

    def element_text(selector: str) -> str:
        """Return the text of the element matching *selector*."""
        element = post_html.select_one(selector)
        if element is None:
            # Name the post and the selector rather than raising an
            # AttributeError on None.
            raise ValueError(
                "post {}: element {!r} not found in response".format(
                    id, selector
                )
            )
        return element.text

    # NOTE(review): a "\r\n" pair in the body becomes "\n\n" here; confirm
    # the site only ever sends bare "\r" line breaks.
    return Post.create(
        id=id,
        subject=element_text("#msgSubject").strip(),
        sender=element_text("#msgSender").strip(),
        body=element_text("#msgBody").strip().replace("\r", "\n"),
        date=dateutil.parser.parse(element_text("#msgDate")),
    )
if __name__ == "__main__":
    # Script entry point: scrape every post after the highest id already
    # stored, so an interrupted run can be restarted without refetching.
    db.connect()
    try:
        db.create_tables([Post])
        db.commit()

        # MAX(id) yields None on an empty table; start from 0 in that case.
        latest_post = Post.select(peewee.fn.max(Post.id)).scalar() or 0
        print("skipping {} existing posts".format(latest_post))

        # NOTE(review): range() excludes its stop value, so the post whose id
        # equals LAST_ID is never fetched - confirm that this is intended.
        pending_ids = range(latest_post + 1, LAST_ID)
        for post_id in tqdm.tqdm(
            pending_ids,
            desc="scraping posts",
            unit=" posts",
        ):
            # get_post() already inserts the row via Post.create(); calling
            # save() on the result only issued a redundant UPDATE per post.
            get_post(post_id)
    finally:
        # Release the SQLite connection even if a fetch fails midway.
        db.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment