Skip to content

Instantly share code, notes, and snippets.

@sithumonline
Created September 17, 2024 09:27
Show Gist options
  • Select an option

  • Save sithumonline/bd9937e7ed48189d7c4fca11fc8d6b49 to your computer and use it in GitHub Desktop.

Select an option

Save sithumonline/bd9937e7ed48189d7c4fca11fc8d6b49 to your computer and use it in GitHub Desktop.
jina.ai reader-lm
# Source notebook: https://colab.research.google.com/drive/1wXWyj5hOxEHY6WeHbOwEzYAC0WB1I5uA?usp=sharing#scrollTo=lHBHjlwgQesA
from vllm import SamplingParams
from vllm import LLM
import re
import requests
from IPython.display import display, Markdown
from vllm.distributed.parallel_state import destroy_model_parallel, destroy_distributed_environment
import gc
import os
import torch
# Model checkpoint and decoding settings for the HTML -> Markdown run.
model_name = 'jinaai/reader-lm-1.5b'

# Output length cap, in generated tokens.
max_tokens = 1024

# temperature 0 together with top_k 1 asks for deterministic decoding.
temperature = 0
top_k = 1

# Penalties applied to tokens that have already appeared.
presence_penalty = 0.25
repetition_penalty = 1.08

sampling_params = SamplingParams(
    max_tokens=max_tokens,
    temperature=temperature,
    top_k=top_k,
    presence_penalty=presence_penalty,
    repetition_penalty=repetition_penalty,
)
print('sampling_params', sampling_params)
#--------------------------
# Build the vLLM engine for `model_name`, requesting float16 weights.
llm = LLM(
    model=model_name,
    dtype='float16',
)
#--------------------------
def display_header(text: str) -> None:
    """Render *text* as a bold Markdown line in the notebook output."""
    display(Markdown(f'**{text}**'))
def display_rendered_md(text: str) -> None:
    """Render *text* as Markdown in the notebook output.

    Intended to mimic the "Reading mode" view of Safari/Firefox by showing
    the converted Markdown fully rendered rather than as source.
    """
    display(Markdown(text))
def display_content(text: str) -> None:
    """Show *text* verbatim inside a fenced Markdown code block.

    The fence is at least three backticks and always one backtick longer
    than the longest backtick run found in *text*, so that a fenced block
    inside the content (common in generated Markdown) cannot terminate the
    outer fence early. Content without such runs is wrapped in the usual
    three-backtick fence.
    """
    body = str(text)
    longest_run = max((len(run) for run in re.findall(r'`+', body)), default=0)
    fence = '`' * max(3, longest_run + 1)
    display(Markdown(f'{fence}\n{body}\n{fence}'))
def get_html_content(url: str, timeout: float = 10) -> str:
    """Fetch the raw HTML of *url* through the Jina Reader endpoint.

    Args:
        url: Address of the page to fetch; it is appended verbatim to
            ``https://r.jina.ai/``.
        timeout: Seconds passed to ``requests.get`` as its timeout.

    Returns:
        The response body on success. On any ``requests`` failure
        (connection error, timeout, non-2xx status) the function does not
        raise; it returns a string of the form ``"error: <details>"``.
        Callers must check for that prefix before treating the result as
        HTML.
    """
    api_url = f'https://r.jina.ai/{url}'
    # Ask the reader service for HTML instead of its default output format.
    headers = {'X-Return-Format': 'html'}
    try:
        response = requests.get(api_url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        return f"error: {e}"
    return response.text
def create_prompt(text: str, tokenizer) -> str:
    """Wrap *text* as a single user turn and format it with the chat template.

    Args:
        text: Content of the user message (here, the cleaned HTML).
        tokenizer: Object exposing ``apply_chat_template``; the script passes
            the tokenizer obtained from the loaded model.

    Returns:
        Whatever ``tokenizer.apply_chat_template`` returns when called with
        ``tokenize=False`` and ``add_generation_prompt=True``.
    """
    messages = [
        {
            "role": "user",
            "content": text,
        },
    ]
    return tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
# Regex sources used by the HTML clean-up helpers in this file. They are kept
# as plain strings (not compiled) because the helpers pass their own flags.
# In the patterns that contain `.*?`, the dot only spans line breaks when
# re.DOTALL is supplied by the caller.

# Matches from an opening <script ...> through the closing </script>
# (optional spaces allowed after '<', after '/', and before '>').
SCRIPT_PATTERN = r'<[ ]*script.*?\/[ ]*script[ ]*>' # non-greedy: stop at the first closing tag
# Matches from an opening <style ...> through the closing </style>
# (same tolerance for spaces as above).
STYLE_PATTERN = r'<[ ]*style.*?\/[ ]*style[ ]*>' # non-greedy: stop at the first closing tag
# Matches a single <meta ...> tag; the match ends at the first '>'.
META_PATTERN = r'<[ ]*meta.*?>' # non-greedy: stop at the first '>'
# Matches an HTML comment, from <!-- through -->.
COMMENT_PATTERN = r'<[ ]*!--.*?--[ ]*>' # non-greedy: stop at the first '-->'
# Matches a single <link ...> tag; the match ends at the first '>'.
LINK_PATTERN = r'<[ ]*link.*?>' # non-greedy: stop at the first '>'
# Matches an <img> tag whose double-quoted src attribute is a base64 data URI.
BASE64_IMG_PATTERN = r'<img[^>]+src="data:image/[^;]+;base64,[^"]+"[^>]*>'
# Matches an <svg> element in three groups: opening tag, inner content,
# closing tag, so the inner content can be swapped while keeping the tags.
SVG_PATTERN = r'(<svg[^>]*>)(.*?)(<\/svg>)'
def replace_svg(html: str, new_content: str = "this is a placeholder") -> str:
    """Replace the inner content of every ``<svg>`` element in *html*.

    The opening and closing tags matched by ``SVG_PATTERN`` are kept; only
    what lies between them is swapped for *new_content*.

    Args:
        html: Markup to process.
        new_content: Text inserted between each pair of svg tags. It is
            inserted literally (a callable replacement is used, so
            backslashes in it are not treated as regex escapes).

    Returns:
        The markup with every matched svg body replaced.
    """
    def _swap_body(match):
        return f"{match.group(1)}{new_content}{match.group(3)}"

    # DOTALL lets the svg body span multiple lines.
    return re.sub(SVG_PATTERN, _swap_body, html, flags=re.DOTALL)
def replace_base64_images(html: str, new_image_src: str = "#") -> str:
    """Replace every base64-embedded ``<img>`` tag with a minimal ``<img>``.

    Args:
        html: Markup to process.
        new_image_src: Value placed in the ``src`` attribute of the
            replacement tag.

    Returns:
        The markup with each tag matched by ``BASE64_IMG_PATTERN`` replaced
        by ``<img src="{new_image_src}"/>``.
    """
    replacement = f'<img src="{new_image_src}"/>'
    # A callable replacement inserts the text literally. Passing the string
    # itself would make re.sub interpret backslashes and group references
    # contained in new_image_src as template escapes.
    return re.sub(BASE64_IMG_PATTERN, lambda _match: replacement, html)
def has_base64_images(text: str) -> bool:
    """Return True if *text* contains a base64 ``data:image/...`` URI.

    A URI counts only when at least one character other than a double quote
    follows ``;base64,``.
    """
    base64_content_pattern = r'data:image/[^;]+;base64,[^"]+'
    # No flags are needed: the pattern contains no '.', '^' or '$', and the
    # negated character classes already match newlines.
    return re.search(base64_content_pattern, text) is not None
def has_svg_components(text: str) -> bool:
    """Return True if *text* contains at least one ``<svg>...</svg>`` element.

    Uses ``SVG_PATTERN`` with ``re.DOTALL`` so the element may span lines.
    """
    return re.search(SVG_PATTERN, text, flags=re.DOTALL) is not None
def clean_html(html: str, clean_svg: bool = False, clean_base64: bool = False) -> str:
    """Strip non-content markup from *html* before it is sent to the model.

    Script blocks, style blocks, meta tags, HTML comments and link tags are
    always removed, in that order.

    Args:
        html: Raw markup.
        clean_svg: When true, the body of every ``<svg>`` element is replaced
            via ``replace_svg``.
        clean_base64: When true, base64-embedded images are replaced via
            ``replace_base64_images``.

    Returns:
        The cleaned markup.
    """
    # Same flags for every removal: case-insensitive tag names, and '.'
    # spanning line breaks so multi-line blocks are matched.
    flags = re.IGNORECASE | re.MULTILINE | re.DOTALL
    removal_patterns = (
        SCRIPT_PATTERN,
        STYLE_PATTERN,
        META_PATTERN,
        COMMENT_PATTERN,
        LINK_PATTERN,
    )
    for pattern in removal_patterns:
        html = re.sub(pattern, '', html, flags=flags)

    if clean_svg:
        html = replace_svg(html)
    if clean_base64:
        html = replace_base64_images(html)
    return html
url = "https://www.hackernews.com"
print(f'We will use Jina Reader to fetch the **raw HTML** from: {url}')
#--------------------------
html = get_html_content(url)
# get_html_content reports failures as an "error: ..." string instead of
# raising. Stop here rather than cleaning that message and sending it to the
# model as if it were a web page.
if html.startswith("error: "):
    raise RuntimeError(f"could not fetch {url}: {html}")
html = clean_html(html, clean_svg=True, clean_base64=True)
prompt = create_prompt(html, llm.get_tokenizer())
results = llm.generate(prompt, sampling_params=sampling_params)
# Show the first completion of each returned request output.
for output in results:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    display_content(generated_text)
#--------------------------
# Tear-down: release the model so GPU memory can be reclaimed.
# NOTE(review): the two destroy_* helpers come from
# vllm.distributed.parallel_state; presumably they dismantle vLLM's
# parallel/distributed state - confirm against the installed vLLM version.
destroy_model_parallel()
destroy_distributed_environment()
# Drop references innermost-first, then the engine wrapper itself.
# NOTE(review): these attribute paths reach into vLLM internals and may not
# exist in every vLLM release - verify before upgrading.
del llm.llm_engine.model_executor.driver_worker
del llm.llm_engine.model_executor
del llm
# Collect the now-unreferenced objects, then ask torch to release its cached
# CUDA memory.
gc.collect()
torch.cuda.empty_cache()
# Report what torch still has allocated; the value is divided by 1024 twice
# to print it in MB.
print(f"cuda memory: {torch.cuda.memory_allocated() // 1024 // 1024}MB")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment