raghack_blogpost.py
import json
import os
import urllib.parse
from datetime import datetime

import openai
import requests
from azure.identity import AzureDeveloperCliCredential, get_bearer_token_provider
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from pydantic import BaseModel
from rich import print

load_dotenv()
# Set up the Azure OpenAI client, authenticating with Entra ID (keyless) via the Azure Developer CLI credential
azure_credential = AzureDeveloperCliCredential(tenant_id=os.getenv('AZURE_TENANT_ID'), process_timeout=60)
token_provider = get_bearer_token_provider(azure_credential, "https://cognitiveservices.azure.com/.default")
openai_client = openai.AzureOpenAI(
    api_version="2024-08-01-preview",
    azure_endpoint=f"https://{os.getenv('AZURE_OPENAI_SERVICE')}.openai.azure.com",
    azure_ad_token_provider=token_provider,
)
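# Assumption: the credential above requires a prior `azd auth login`, and a .env file
# defines AZURE_TENANT_ID and AZURE_OPENAI_SERVICE (the Azure OpenAI resource name).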
def get_all_streams():
    response = requests.get("https://github.com/microsoft/RAG_Hack?tab=readme-ov-file#stream-schedule")
    soup = BeautifulSoup(response.text, 'html.parser')
    # Find all the tables, then filter to only tables with a "Day/Time" column
    tables = soup.find_all('table')
    schedule_tables = [table for table in tables if table.find('th', string="Day/Time")]
    # Make a list of the rows
    rows = []
    for table in schedule_tables:
        for row in table.find_all('tr'):
            # Skip rows that don't have exactly three td elements
            cells = row.find_all('td')
            if len(cells) != 3:
                continue
            date_str = cells[0].text
            topic_title = cells[1].find('a').text
            reactor_link = cells[1].find('a')['href']
            discussion_link = cells[2].find('a')['href']
            rows.append({
                'datetime': date_str,
                'title': topic_title,
                'reactor_link': reactor_link,
                'discussion_link': discussion_link
            })
    # Save the rows as JSON to a file
    with open('raghack_streams.json', 'w') as f:
        json.dump(rows, f, indent=4)
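# Illustrative shape of raghack_streams.json (hypothetical values, not real rows):
# [
#     {
#         "datetime": "September 3, 2024, 3pm UTC",
#         "title": "Intro to RAG",
#         "reactor_link": "https://developer.microsoft.com/en-us/reactor/events/...",
#         "discussion_link": "https://github.com/microsoft/RAG_Hack/discussions/..."
#     }
# ]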
def get_all_stream_resources():
    # Open the streams file saved by get_all_streams()
    with open('raghack_streams.json', 'r') as f:
        streams = json.load(f)
    for stream in streams:
        update_stream_with_resources(stream)
    with open('raghack_streams_with_resources.json', 'w') as f:
        json.dump(streams, f, indent=4)
def update_stream_with_resources(stream):
    # Fetch the GitHub discussion page, e.g. https://github.com/microsoft/RAG_Hack/discussions/24
    response = requests.get(stream['discussion_link'])
    soup = BeautifulSoup(response.text, 'html.parser')
    # Get the first timeline comment (the discussion's opening post)
    timeline_comment = soup.find('div', class_='timeline-comment').prettify()

    class StreamLink(BaseModel):
        link_title: str
        link_url: str

    class StreamResources(BaseModel):
        aka_link: str
        youtube_link: str
        slides_ppt_link: str | None
        slides_pdf_link: str | None
        extra_links: list[StreamLink]

    # Use structured outputs to extract the resource links from the comment HTML
    model_name = "gpt-4o"
    completion = openai_client.beta.chat.completions.parse(
        model=model_name,
        messages=[
            {"role": "system", "content": "Extract the information from the blog post"},
            {"role": "user", "content": timeline_comment},
        ],
        response_format=StreamResources,
    )
    output = completion.choices[0].message.parsed
    stream['resources'] = output.model_dump()
def update_stream_with_description(stream):
    response = requests.get(stream['reactor_link'])
    soup = BeautifulSoup(response.text, 'html.parser')
    try:
        description = soup.find('div', attrs={'pw-id': 'description'}).text.strip()
        stream['description'] = description
    except AttributeError:
        print(f"Could not find description for {stream['title']}")


def update_streams_with_descriptions():
    with open('raghack_streams_with_resources.json', 'r') as f:
        streams = json.load(f)
    for stream in streams:
        update_stream_with_description(stream)
    with open('raghack_streams_with_resources_and_descriptions.json', 'w') as f:
        json.dump(streams, f, indent=4)
def write_blog_post():
    # Open streams with resources and descriptions
    with open('raghack_streams_with_resources_and_descriptions.json', 'r') as f:
        streams = json.load(f)
    streams_html = ""
    for stream in streams:
        stream_html = f"<h3>{stream['title']}</h3>\n"
        youtube_url = stream['resources']['youtube_link']
        if youtube_url:
            youtube_embed_url = youtube_url.replace('watch?v=', 'embed/')
            youtube_url_encoded = urllib.parse.quote(youtube_url)
            youtube_embed_url_encoded = urllib.parse.quote(youtube_embed_url)
            youtube_id = youtube_url.split('v=')[1]
            #stream_html += f'<li-video src="https://cdn.embedly.com/widgets/media.html?src={youtube_embed_url_encoded}&display_name=YouTube&url={youtube_url_encoded}&image=http%3A%2F%2Fi.ytimg.com%2Fvi%{youtube_id}%2Fhqdefault.jpg&type=text%2Fhtml&schema=youtube" vid="{youtube_url}" external="url" thumbnail="" title="" align="center" size="large" layout="stretch" resized="false" caption="false"></li-video>'
            stream_html += f'<a href="{youtube_url}"><img src="http://i.ytimg.com/vi/{youtube_id}/hqdefault.jpg" alt="YouTube video" width="220"><br>📺 Watch YouTube recording</a>\n'
            #stream_html += youtube_url + "\n"
        description = stream.get('description', '')
        description = description.replace('** Part of RAGHack, a free global hackathon to develop RAG applications. Join at https://aka.ms/raghack **', '')
        stream_html += f"<p>{description}</p>\n"
        # Add links
        stream_html += "<p>🔗 Helpful links:</p>\n"
        stream_html += "<ul>\n"
        if stream['resources']['slides_ppt_link']:
            stream_html += f"<li><a href='{stream['resources']['slides_ppt_link']}'>Slides (PPT)</a></li>\n"
        if stream['resources']['slides_pdf_link']:
            stream_html += f"<li><a href='{stream['resources']['slides_pdf_link']}'>Slides (PDF)</a></li>\n"
        stream_html += f"<li><a href='{stream['discussion_link']}'>Discussion</a></li>\n"
        for link in stream['resources']['extra_links']:
            stream_html += f"<li><a href='{link['link_url']}'>{link['link_title']}</a></li>\n"
        stream_html += "</ul>\n\n"
        streams_html += stream_html
    with open('raghack_blogpost.html', 'w') as f:
        f.write(streams_html)


if __name__ == "__main__":
    write_blog_post()
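# Note: write_blog_post() reads the JSON produced by the earlier steps, so a full
# pipeline run would (presumably) call the functions in this order:
#     get_all_streams()
#     get_all_stream_resources()
#     update_streams_with_descriptions()
#     write_blog_post()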