Skip to content

Instantly share code, notes, and snippets.

@pamelafox
Created December 19, 2024 21:07
Show Gist options
  • Save pamelafox/dfcdf0b9fe4436059993792b0db05783 to your computer and use it in GitHub Desktop.
raghack_blogpost.py
import json
import os
import requests
from datetime import datetime
import urllib.parse
from bs4 import BeautifulSoup
from rich import print
import openai
from pydantic import BaseModel
from azure.identity import AzureDeveloperCliCredential, get_bearer_token_provider
from dotenv import load_dotenv
# Pull local settings (AZURE_TENANT_ID, AZURE_OPENAI_SERVICE) from a .env file.
load_dotenv()
# Use Azure OpenAI to generate a markdown for the issue.
# Auth is keyless: the Azure Developer CLI credential issues Entra ID tokens
# scoped to Cognitive Services, so no API key is stored in the environment.
azure_credential = AzureDeveloperCliCredential(tenant_id=os.getenv('AZURE_TENANT_ID'), process_timeout=60)
token_provider = get_bearer_token_provider(azure_credential, "https://cognitiveservices.azure.com/.default")
# Module-level client shared by the functions below.
# NOTE(review): api_version 2024-08-01-preview is the earliest with structured
# outputs (beta.chat.completions.parse) — required by update_stream_with_resources.
openai_client = openai.AzureOpenAI(
    api_version="2024-08-01-preview",
    azure_endpoint=f"https://{os.getenv('AZURE_OPENAI_SERVICE')}.openai.azure.com",
    azure_ad_token_provider=token_provider,
)
def get_all_streams():
    """Scrape the RAGHack stream schedule from the repo README.

    Writes one record per stream (datetime, title, reactor_link,
    discussion_link) to raghack_streams.json.
    """
    # The scraped copy of this script carried a link-shortener-mangled host;
    # restored to the canonical GitHub URL of the schedule README.
    response = requests.get("https://github.com/microsoft/RAG_Hack?tab=readme-ov-file#stream-schedule")
    # Fail loudly instead of silently parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # Only the README tables with a "Day/Time" header hold the schedule.
    schedule_tables = [
        table for table in soup.find_all('table')
        if table.find('th', string="Day/Time")
    ]
    rows = []
    for table in schedule_tables:
        for row in table.find_all('tr'):
            # Hoisted: the original re-ran find_all('td') for every cell.
            tds = row.find_all('td')
            # Header / malformed rows don't have exactly three cells — skip.
            if len(tds) != 3:
                continue
            rows.append({
                'datetime': tds[0].text,
                'title': tds[1].find('a').text,
                'reactor_link': tds[1].find('a')['href'],
                'discussion_link': tds[2].find('a')['href'],
            })
    # Save the rows as JSON to a file.
    with open('raghack_streams.json', 'w', encoding='utf-8') as f:
        json.dump(rows, f, indent=4)
def get_all_stream_resources():
    """Enrich every scraped stream record with its resource links, then re-save.

    Reads raghack_streams.json (produced by get_all_streams) and writes
    raghack_streams_with_resources.json.
    """
    with open('raghack_streams.json', 'r') as infile:
        stream_records = json.load(infile)
    # Each record is mutated in place, gaining a 'resources' dict.
    for record in stream_records:
        update_stream_with_resources(record)
    with open('raghack_streams_with_resources.json', 'w') as outfile:
        json.dump(stream_records, outfile, indent=4)
def update_stream_with_resources(stream):
    """Extract resource links for one stream from its GitHub discussion.

    Fetches stream['discussion_link'] (e.g.
    https://github.com/microsoft/RAG_Hack/discussions/24), takes the opening
    timeline comment, and has Azure OpenAI parse it into structured links.
    Mutates `stream` in place, adding a 'resources' dict; logs and returns
    without changes if the discussion markup is missing.
    """
    response = requests.get(stream['discussion_link'])
    soup = BeautifulSoup(response.text, 'html.parser')
    # The first timeline-comment div holds the discussion's opening post.
    # Guard against missing markup instead of crashing with a bare
    # AttributeError — matches the style of update_stream_with_description.
    comment_div = soup.find('div', class_='timeline-comment')
    if comment_div is None:
        print(f"Could not find discussion comment for {stream['title']}")
        return
    timeline_comment = comment_div.prettify()

    # Schemas for structured-output parsing of the comment's links.
    class StreamLink(BaseModel):
        link_title: str
        link_url: str

    class StreamResources(BaseModel):
        aka_link: str
        youtube_link: str
        slides_ppt_link: str | None
        slides_pdf_link: str | None
        extra_links: list[StreamLink]

    model_name = "gpt-4o"
    # beta.chat.completions.parse coerces the model output into StreamResources.
    completion = openai_client.beta.chat.completions.parse(
        model=model_name,
        messages=[
            {"role": "system", "content": "Extract the information from the blog post"},
            {"role": "user", "content": timeline_comment},
        ],
        response_format=StreamResources,
    )
    output = completion.choices[0].message.parsed
    stream['resources'] = output.model_dump()
def update_stream_with_description(stream):
    """Attach the Reactor event description to `stream`, in place.

    Logs and leaves the record untouched when the event page has no
    description element.
    """
    page = requests.get(stream['reactor_link'])
    soup = BeautifulSoup(page.text, 'html.parser')
    # Explicit None check replaces the original try/except AttributeError.
    description_div = soup.find('div', attrs={'pw-id': 'description'})
    if description_div is not None:
        stream['description'] = description_div.text.strip()
    else:
        print(f"Could not find description for {stream['title']}")
def update_streams_with_descriptions():
    """Add Reactor descriptions to every stream record and re-save.

    Reads raghack_streams_with_resources.json and writes
    raghack_streams_with_resources_and_descriptions.json.
    """
    source_path = 'raghack_streams_with_resources.json'
    target_path = 'raghack_streams_with_resources_and_descriptions.json'
    with open(source_path, 'r') as infile:
        records = json.load(infile)
    for record in records:
        update_stream_with_description(record)
    with open(target_path, 'w') as outfile:
        outfile.write(json.dumps(records, indent=4))
def _render_stream_html(stream):
    """Return the HTML fragment for one enriched stream record."""
    parts = [f"<h3>{stream['title']}</h3>\n"]
    resources = stream['resources']
    youtube_url = resources['youtube_link']
    if youtube_url:
        # Video id is everything after 'v=' in a standard watch URL.
        # NOTE(review): breaks on youtu.be short links — none observed so far.
        youtube_id = youtube_url.split('v=')[1]
        # Thumbnail that links out to the recording.
        parts.append(f'<a href="{youtube_url}"><img src="http://i.ytimg.com/vi/{youtube_id}/hqdefault.jpg" alt="YouTube video" width="220"><br>📺 Watch YouTube recording</a>\n')
    description = stream.get('description', '')
    # Strip the boilerplate RAGHack banner every Reactor description carries.
    description = description.replace('** Part of RAGHack, a free global hackathon to develop RAG applications. Join at https://aka.ms/raghack **', '')
    parts.append(f"<p>{description}</p>\n")
    # Links section: slides (when present), the discussion, then any extras.
    parts.append("<p>🔗 Helpful links:</p>\n")
    parts.append("<ul>\n")
    if resources['slides_ppt_link']:
        parts.append(f"<li><a href='{resources['slides_ppt_link']}'>Slides (PPT)</a></li>\n")
    if resources['slides_pdf_link']:
        parts.append(f"<li><a href='{resources['slides_pdf_link']}'>Slides (PDF)</a></li>\n")
    parts.append(f"<li><a href='{stream['discussion_link']}'>Discussion</a></li>\n")
    for link in resources['extra_links']:
        parts.append(f"<li><a href='{link['link_url']}'>{link['link_title']}</a></li>\n")
    parts.append("</ul>\n\n")
    return "".join(parts)


def write_blog_post():
    """Render all enriched stream records into raghack_blogpost.html.

    Reads raghack_streams_with_resources_and_descriptions.json (produced by
    update_streams_with_descriptions).
    """
    with open('raghack_streams_with_resources_and_descriptions.json', 'r', encoding='utf-8') as f:
        streams = json.load(f)
    # join over per-stream fragments instead of the original quadratic +=.
    streams_html = "".join(_render_stream_html(stream) for stream in streams)
    # utf-8 is required: the fragments contain emoji that the platform
    # default codec (e.g. cp1252 on Windows) cannot encode.
    with open('raghack_blogpost.html', 'w', encoding='utf-8') as f:
        f.write(streams_html)
if __name__ == "__main__":
    # Entry point runs only the final rendering step; run get_all_streams(),
    # get_all_stream_resources(), and update_streams_with_descriptions()
    # first to (re)build the JSON inputs it reads.
    write_blog_post()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment