Last active
October 26, 2024 13:59
-
-
Save vzakharov/50ce3265a019eacf7ce9c82e00be442d to your computer and use it in GitHub Desktop.
Snippet from production code that extracts Open Graph (og) tags from Suno pages. It doesn’t include the Player model, but you’ll get the gist (lol).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from typing import cast | |
import requests | |
from bs4 import BeautifulSoup, Tag | |
from elo.models import Player | |
from rand.errors import BadRequestError | |
from .models import Player | |
def parse_og_tags(url: str, soup: BeautifulSoup | None = None) -> dict[str, str | None] | None:
    """Collect a page's ``<meta property=... content=...>`` tags into a dict.

    Args:
        url: Address of the page. It is only fetched (through ``get_soup``)
            when ``soup`` is not supplied.
        soup: Already-parsed document to read the ``<meta>`` tags from.

    Returns:
        A mapping of property name (with a leading ``og:`` removed) to the
        tag's content, or ``None`` when no tag carries both a string
        ``property`` and a string ``content``. When a property occurs more
        than once, the last occurrence wins.
    """
    if soup is None:
        soup = get_soup(url)

    og_tags: dict[str, str | None] = {}
    for meta in soup.find_all('meta'):
        name = meta.get('property')
        content = meta.get('content')
        # Attribute lookups can yield None (absent) or a non-string value;
        # only plain string pairs are usable here.
        if isinstance(name, str) and isinstance(content, str):
            # Strip just the leading 'og:'. str.replace would also remove an
            # 'og:' found inside another name, e.g. 'blog:title' -> 'bltitle'.
            og_tags[name.removeprefix('og:')] = content

    return og_tags or None
def get_soup(url: str, timeout: float | None = 10) -> BeautifulSoup:
    """Download ``url`` and parse the response body with ``html.parser``.

    Args:
        url: Address of the page to fetch with an HTTP GET.
        timeout: Seconds to wait for the server before giving up; ``None``
            waits indefinitely.

    Returns:
        The parsed document built from the response text.

    Raises:
        requests.exceptions.RequestException: If the request fails, including
            when ``timeout`` elapses.
    """
    # requests applies no timeout unless told to; without one a stalled
    # server would block the caller forever.
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')
def update_player_og_tags(
    player: Player,
    og_tags: dict[str, str | None],
    ignore_parsed_fields: list[str] | None = None,
) -> None:
    """Fill empty player attributes from parsed Open Graph tags.

    An attribute is only written when its current value is falsy, so values
    already present on ``player`` are never overwritten. The player is
    modified in place and is not saved here.

    Args:
        player: Object whose ``name``, ``description``, ``image_url`` and
            ``audio_url`` attributes may be populated.
        og_tags: Tag name (without the ``og:`` prefix) mapped to its content.
        ignore_parsed_fields: Attribute names that must be left untouched.
            ``None`` (the default) ignores nothing.
    """
    # A None default avoids sharing one mutable list between calls.
    ignored = ignore_parsed_fields or ()

    # Player attribute -> candidate tag names, in order of preference.
    tags_by_field = {
        'name': ('title',),
        'description': ('description',),
        'image_url': ('image',),
        'audio_url': ('audio', 'audio:url'),
    }

    for field, candidates in tags_by_field.items():
        if field in ignored:
            continue
        for tag in candidates:
            value = og_tags.get(tag)
            if value and not getattr(player, field):
                setattr(player, field, value)
                # The field is now filled; later candidates could not
                # replace it anyway.
                break
def parse_url(player: Player):
    """Populate a player from the page at ``player.url`` and save it.

    Three sources are applied in order: the page's meta tags (which only fill
    empty fields), the project's ``parsing_regexes`` and the project's
    ``parsing_selectors`` (both of which overwrite the field). Fields listed
    in the project's ``ignore_parsed_fields`` are skipped by all three.

    Args:
        player: Player to update; it must have a non-empty ``url``.

    Returns:
        ``player.data`` after the player has been flagged as parsed and saved.

    Raises:
        BadRequestError: If the player has no URL.
    """
    if not player.url:
        raise BadRequestError('Player does not have a URL')

    soup = get_soup(player.url)
    project = player.project
    # The regex and selector settings are treated as optional below; do the
    # same here so an unset ignore list does not crash `field in ...`.
    ignored = project.ignore_parsed_fields or []

    og_tags = parse_og_tags(player.url, soup)
    if og_tags:
        update_player_og_tags(player, og_tags, ignored)

    regexes = project.parsing_regexes
    if regexes:
        # Serialise the document only when there is something to search for.
        html = str(soup)
        for field, regex in regexes.items():
            if field in ignored:
                continue
            match = re.search(regex, html)
            if match:
                # The first capture group of the pattern is the field value.
                setattr(player, field, match.group(1))

    selectors = project.parsing_selectors
    if selectors:
        for field, selector in selectors.items():
            if field in ignored:
                continue
            element = soup.select_one(selector['selector'])
            if not element:
                continue
            # With an 'attribute' entry the value comes from that attribute,
            # otherwise from the element's text.
            attribute = selector.get('attribute')
            value = element.get(attribute) if attribute else element.text
            if value is None:
                # The element lacks the requested attribute; keep whatever
                # was parsed earlier instead of overwriting it with None.
                continue
            setattr(player, field, value)

    player.url_parsed = True
    player.save()
    return player.data
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment