Created
September 17, 2024 09:27
-
-
Save sithumonline/bd9937e7ed48189d7c4fca11fc8d6b49 to your computer and use it in GitHub Desktop.
jina.ai reader-lm
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #https://colab.research.google.com/drive/1wXWyj5hOxEHY6WeHbOwEzYAC0WB1I5uA?usp=sharing#scrollTo=lHBHjlwgQesA | |
| from vllm import SamplingParams | |
| from vllm import LLM | |
| import re | |
| import requests | |
| from IPython.display import display, Markdown | |
| from vllm.distributed.parallel_state import destroy_model_parallel, destroy_distributed_environment | |
| import gc | |
| import os | |
| import torch | |
| model_name = 'jinaai/reader-lm-1.5b' | |
| top_k = 1 | |
| temperature = 0 | |
| repetition_penalty = 1.08 | |
| presence_penalty = 0.25 | |
| max_tokens = 1024 | |
| sampling_params = SamplingParams( | |
| temperature=temperature, | |
| top_k=top_k, | |
| presence_penalty=presence_penalty, | |
| repetition_penalty=repetition_penalty, | |
| max_tokens=max_tokens, | |
| ) | |
| print('sampling_params', sampling_params) | |
| #-------------------------- | |
| llm = LLM(model=model_name, dtype='float16') | |
| #-------------------------- | |
| def display_header(text): | |
| display(Markdown(f'**{text}**')) | |
| def display_rendered_md(text): | |
| # for mimic "Reading mode" in Safari/Firefox | |
| display(Markdown(text)) | |
| def display_content(text): | |
| display(Markdown(f'```\n{text}\n```')) | |
| def get_html_content(url): | |
| api_url = f'https://r.jina.ai/{url}' | |
| headers = {'X-Return-Format': 'html'} | |
| try: | |
| response = requests.get(api_url, headers=headers, timeout=10) | |
| response.raise_for_status() | |
| return response.text | |
| except requests.exceptions.RequestException as e: | |
| return f"error: {str(e)}" | |
| def get_html_content(url): | |
| api_url = f'https://r.jina.ai/{url}' | |
| headers = {'X-Return-Format': 'html'} | |
| try: | |
| response = requests.get(api_url, headers=headers, timeout=10) | |
| response.raise_for_status() | |
| return response.text | |
| except requests.exceptions.RequestException as e: | |
| return f"error: {str(e)}" | |
| def create_prompt(text:str, tokenizer) -> str: | |
| messages = [ | |
| { | |
| "role": "user", | |
| "content": text | |
| }, | |
| ] | |
| return tokenizer.apply_chat_template( | |
| messages, tokenize=False, add_generation_prompt=True | |
| ) | |
| # (REMOVE <SCRIPT> to </script> and variations) | |
| SCRIPT_PATTERN = r'<[ ]*script.*?\/[ ]*script[ ]*>' # mach any char zero or more times | |
| # text = re.sub(pattern, '', text, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL)) | |
| # (REMOVE HTML <STYLE> to </style> and variations) | |
| STYLE_PATTERN = r'<[ ]*style.*?\/[ ]*style[ ]*>' # mach any char zero or more times | |
| # text = re.sub(pattern, '', text, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL)) | |
| # (REMOVE HTML <META> to </meta> and variations) | |
| META_PATTERN = r'<[ ]*meta.*?>' # mach any char zero or more times | |
| # text = re.sub(pattern, '', text, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL)) | |
| # (REMOVE HTML COMMENTS <!-- to --> and variations) | |
| COMMENT_PATTERN = r'<[ ]*!--.*?--[ ]*>' # mach any char zero or more times | |
| # text = re.sub(pattern, '', text, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL)) | |
| # (REMOVE HTML LINK <LINK> to </link> and variations) | |
| LINK_PATTERN = r'<[ ]*link.*?>' # mach any char zero or more times | |
| # (REPLACE base64 images) | |
| BASE64_IMG_PATTERN = r'<img[^>]+src="data:image/[^;]+;base64,[^"]+"[^>]*>' | |
| # (REPLACE <svg> to </svg> and variations) | |
| SVG_PATTERN = r'(<svg[^>]*>)(.*?)(<\/svg>)' | |
| def replace_svg(html: str, new_content: str = "this is a placeholder") -> str: | |
| return re.sub( | |
| SVG_PATTERN, | |
| lambda match: f"{match.group(1)}{new_content}{match.group(3)}", | |
| html, | |
| flags=re.DOTALL, | |
| ) | |
| def replace_base64_images(html: str, new_image_src: str = "#") -> str: | |
| return re.sub(BASE64_IMG_PATTERN, f'<img src="{new_image_src}"/>', html) | |
| def has_base64_images(text: str) -> bool: | |
| base64_content_pattern = r'data:image/[^;]+;base64,[^"]+' | |
| return bool(re.search(base64_content_pattern, text, flags=re.DOTALL)) | |
| def has_svg_components(text: str) -> bool: | |
| return bool(re.search(SVG_PATTERN, text, flags=re.DOTALL)) | |
| def clean_html(html: str, clean_svg: bool = False, clean_base64: bool = False): | |
| html = re.sub(SCRIPT_PATTERN, '', html, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL)) | |
| html = re.sub(STYLE_PATTERN, '', html, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL)) | |
| html = re.sub(META_PATTERN, '', html, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL)) | |
| html = re.sub(COMMENT_PATTERN, '', html, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL)) | |
| html = re.sub(LINK_PATTERN, '', html, flags=(re.IGNORECASE | re.MULTILINE | re.DOTALL)) | |
| if clean_svg: | |
| html = replace_svg(html) | |
| if clean_base64: | |
| html = replace_base64_images(html) | |
| return html | |
| url = "https://www.hackernews.com" | |
| print(f'We will use Jina Reader to fetch the **raw HTML** from: {url}') | |
| #-------------------------- | |
| html = get_html_content(url) | |
| html = clean_html(html, clean_svg=True, clean_base64=True) | |
| prompt = create_prompt(html, llm.get_tokenizer()) | |
| results = llm.generate(prompt, sampling_params=sampling_params) | |
| for output in results: | |
| prompt = output.prompt | |
| generated_text = output.outputs[0].text | |
| display_content(generated_text) | |
| #-------------------------- | |
| destroy_model_parallel() | |
| destroy_distributed_environment() | |
| del llm.llm_engine.model_executor.driver_worker | |
| del llm.llm_engine.model_executor | |
| del llm | |
| gc.collect() | |
| torch.cuda.empty_cache() | |
| print(f"cuda memory: {torch.cuda.memory_allocated() // 1024 // 1024}MB") | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment