Skip to content

Instantly share code, notes, and snippets.

@pamelafox
Created December 19, 2024 21:07
Show Gist options
  • Save pamelafox/dfcdf0b9fe4436059993792b0db05783 to your computer and use it in GitHub Desktop.
raghack_blogpost.py
import json
import os
import requests
from datetime import datetime
import urllib.parse
from bs4 import BeautifulSoup
from rich import print
import openai
from pydantic import BaseModel
from azure.identity import AzureDeveloperCliCredential, get_bearer_token_provider
from dotenv import load_dotenv
# Pull local settings (AZURE_TENANT_ID, AZURE_OPENAI_SERVICE) from a .env file.
load_dotenv()
# Use Azure OpenAI to generate a markdown for the issue.
# Auth is keyless: the Azure Developer CLI credential issues Entra ID tokens
# scoped to Cognitive Services, so no API key is stored in the environment.
azure_credential = AzureDeveloperCliCredential(tenant_id=os.getenv('AZURE_TENANT_ID'), process_timeout=60)
token_provider = get_bearer_token_provider(azure_credential, "https://cognitiveservices.azure.com/.default")
# Module-level client shared by the functions below.
# NOTE(review): api_version 2024-08-01-preview is the earliest with structured
# outputs (beta.chat.completions.parse) — required by update_stream_with_resources.
openai_client = openai.AzureOpenAI(
    api_version="2024-08-01-preview",
    azure_endpoint=f"https://{os.getenv('AZURE_OPENAI_SERVICE')}.openai.azure.com",
    azure_ad_token_provider=token_provider,
)
def get_all_streams():
    """Scrape the RAGHack stream schedule from the repo README.

    Writes one record per stream (datetime, title, reactor_link,
    discussion_link) to raghack_streams.json.
    """
    # The scraped copy of this script carried a link-shortener-mangled host;
    # restored to the canonical GitHub URL of the schedule README.
    response = requests.get("https://github.com/microsoft/RAG_Hack?tab=readme-ov-file#stream-schedule")
    # Fail loudly instead of silently parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    # Only the README tables with a "Day/Time" header hold the schedule.
    schedule_tables = [
        table for table in soup.find_all('table')
        if table.find('th', string="Day/Time")
    ]
    rows = []
    for table in schedule_tables:
        for row in table.find_all('tr'):
            # Hoisted: the original re-ran find_all('td') for every cell.
            tds = row.find_all('td')
            # Header / malformed rows don't have exactly three cells — skip.
            if len(tds) != 3:
                continue
            rows.append({
                'datetime': tds[0].text,
                'title': tds[1].find('a').text,
                'reactor_link': tds[1].find('a')['href'],
                'discussion_link': tds[2].find('a')['href'],
            })
    # Save the rows as JSON to a file.
    with open('raghack_streams.json', 'w', encoding='utf-8') as f:
        json.dump(rows, f, indent=4)
def get_all_stream_resources():
    """Enrich every scraped stream record with its resource links, then re-save.

    Reads raghack_streams.json (produced by get_all_streams) and writes
    raghack_streams_with_resources.json.
    """
    with open('raghack_streams.json', 'r') as infile:
        stream_records = json.load(infile)
    # Each record is mutated in place, gaining a 'resources' dict.
    for record in stream_records:
        update_stream_with_resources(record)
    with open('raghack_streams_with_resources.json', 'w') as outfile:
        json.dump(stream_records, outfile, indent=4)
def update_stream_with_resources(stream):
    """Extract resource links for one stream from its GitHub discussion.

    Fetches stream['discussion_link'] (e.g.
    https://github.com/microsoft/RAG_Hack/discussions/24), takes the opening
    timeline comment, and has Azure OpenAI parse it into structured links.
    Mutates `stream` in place, adding a 'resources' dict; logs and returns
    without changes if the discussion markup is missing.
    """
    response = requests.get(stream['discussion_link'])
    soup = BeautifulSoup(response.text, 'html.parser')
    # The first timeline-comment div holds the discussion's opening post.
    # Guard against missing markup instead of crashing with a bare
    # AttributeError — matches the style of update_stream_with_description.
    comment_div = soup.find('div', class_='timeline-comment')
    if comment_div is None:
        print(f"Could not find discussion comment for {stream['title']}")
        return
    timeline_comment = comment_div.prettify()

    # Schemas for structured-output parsing of the comment's links.
    class StreamLink(BaseModel):
        link_title: str
        link_url: str

    class StreamResources(BaseModel):
        aka_link: str
        youtube_link: str
        slides_ppt_link: str | None
        slides_pdf_link: str | None
        extra_links: list[StreamLink]

    model_name = "gpt-4o"
    # beta.chat.completions.parse coerces the model output into StreamResources.
    completion = openai_client.beta.chat.completions.parse(
        model=model_name,
        messages=[
            {"role": "system", "content": "Extract the information from the blog post"},
            {"role": "user", "content": timeline_comment},
        ],
        response_format=StreamResources,
    )
    output = completion.choices[0].message.parsed
    stream['resources'] = output.model_dump()
def update_stream_with_description(stream):
    """Attach the Reactor event description to `stream`, in place.

    Logs and leaves the record untouched when the event page has no
    description element.
    """
    page = requests.get(stream['reactor_link'])
    soup = BeautifulSoup(page.text, 'html.parser')
    # Explicit None check replaces the original try/except AttributeError.
    description_div = soup.find('div', attrs={'pw-id': 'description'})
    if description_div is not None:
        stream['description'] = description_div.text.strip()
    else:
        print(f"Could not find description for {stream['title']}")
def update_streams_with_descriptions():
    """Add Reactor descriptions to every stream record and re-save.

    Reads raghack_streams_with_resources.json and writes
    raghack_streams_with_resources_and_descriptions.json.
    """
    source_path = 'raghack_streams_with_resources.json'
    target_path = 'raghack_streams_with_resources_and_descriptions.json'
    with open(source_path, 'r') as infile:
        records = json.load(infile)
    for record in records:
        update_stream_with_description(record)
    with open(target_path, 'w') as outfile:
        outfile.write(json.dumps(records, indent=4))
def _render_stream_html(stream):
    """Return the HTML fragment for one enriched stream record."""
    parts = [f"<h3>{stream['title']}</h3>\n"]
    resources = stream['resources']
    youtube_url = resources['youtube_link']
    if youtube_url:
        # Video id is everything after 'v=' in a standard watch URL.
        # NOTE(review): breaks on youtu.be short links — none observed so far.
        youtube_id = youtube_url.split('v=')[1]
        # Thumbnail that links out to the recording.
        parts.append(f'<a href="{youtube_url}"><img src="http://i.ytimg.com/vi/{youtube_id}/hqdefault.jpg" alt="YouTube video" width="220"><br>📺 Watch YouTube recording</a>\n')
    description = stream.get('description', '')
    # Strip the boilerplate RAGHack banner every Reactor description carries.
    description = description.replace('** Part of RAGHack, a free global hackathon to develop RAG applications. Join at https://aka.ms/raghack **', '')
    parts.append(f"<p>{description}</p>\n")
    # Links section: slides (when present), the discussion, then any extras.
    parts.append("<p>🔗 Helpful links:</p>\n")
    parts.append("<ul>\n")
    if resources['slides_ppt_link']:
        parts.append(f"<li><a href='{resources['slides_ppt_link']}'>Slides (PPT)</a></li>\n")
    if resources['slides_pdf_link']:
        parts.append(f"<li><a href='{resources['slides_pdf_link']}'>Slides (PDF)</a></li>\n")
    parts.append(f"<li><a href='{stream['discussion_link']}'>Discussion</a></li>\n")
    for link in resources['extra_links']:
        parts.append(f"<li><a href='{link['link_url']}'>{link['link_title']}</a></li>\n")
    parts.append("</ul>\n\n")
    return "".join(parts)


def write_blog_post():
    """Render all enriched stream records into raghack_blogpost.html.

    Reads raghack_streams_with_resources_and_descriptions.json (produced by
    update_streams_with_descriptions).
    """
    with open('raghack_streams_with_resources_and_descriptions.json', 'r', encoding='utf-8') as f:
        streams = json.load(f)
    # join over per-stream fragments instead of the original quadratic +=.
    streams_html = "".join(_render_stream_html(stream) for stream in streams)
    # utf-8 is required: the fragments contain emoji that the platform
    # default codec (e.g. cp1252 on Windows) cannot encode.
    with open('raghack_blogpost.html', 'w', encoding='utf-8') as f:
        f.write(streams_html)
if __name__ == "__main__":
    # Entry point runs only the final rendering step; run get_all_streams(),
    # get_all_stream_resources(), and update_streams_with_descriptions()
    # first to (re)build the JSON inputs it reads.
    write_blog_post()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment