Created
February 5, 2024 21:28
-
-
Save mwicat/8ef974c1c4e6d3f0d1dee6bb37071127 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Script that takes a patent URL from Google Patents and creates a small HTML
table with the patent metadata, then opens it in your default browser.

From there you can copy the table to the clipboard and paste it into a
spreadsheet, where it arrives formatted and parsed as a proper table. Besides
spreadsheets, any program that can process HTML tables from the clipboard will
also work.

URL formats:
- https://patents.google.com/patent/[PATENT ID]
- https://patentimages.storage.googleapis.com/.../[PATENT ID].pdf

Examples:
- https://patents.google.com/patent/US3798032
- https://patentimages.storage.googleapis.com/21/6e/08/1fc5322ac7d593/US3798032.pdf
"""
import html
import logging
import os
import sys
import tempfile
import webbrowser
from pathlib import Path
from urllib.parse import urlsplit

import bs4
import requests
# Log at INFO level so the logging.info() progress messages emitted by this
# script are actually shown.
logging.basicConfig(level=logging.INFO)

# Single-row HTML table used as the output document. It is filled in with
# str.format() and expects these named fields:
#   {url}            - link target wrapped around the title
#   {title}          - link text
#   {patent_num}     - patent number
#   {date_issued}    - issue date value
#   {date_submitted} - submission date value
#   {assignee}       - assignee
# The first and third cells are fixed literals ("!" and "patent").
# NOTE: values are inserted verbatim, so callers must HTML-escape them first.
HTML_TPL = '''
<table border="1" style="font-family: arial">
<tr>
<td>!</td>
<td><a href="{url}">{title}</a></td>
<td>patent</td>
<td>{patent_num}</td>
<td>{date_issued}</td>
<td>{date_submitted}</td>
<td>{assignee}</td>
</tr>
</table>
'''
# Shared HTTP session used for every request made by get_patent_data().
s = requests.Session()


def _resolve_patent_url(url):
    """Return the Google Patents page URL to fetch for *url*.

    URLs that already start with ``https://patents.google.com/patent/`` are
    returned unchanged. For anything else, the last path component with its
    extension removed is taken as the patent id and a Google Patents URL is
    built from it.
    """
    if url.startswith('https://patents.google.com/patent/'):
        return url
    # Work on the path component only, so a query string or fragment on the
    # URL cannot end up inside the extracted patent id.
    path = urlsplit(url).path.rstrip('/')
    patent_num = os.path.splitext(os.path.basename(path))[0]
    return f'https://patents.google.com/patent/{patent_num}'


def get_patent_data(url, timeout=30):
    """Fetch a Google Patents page and extract its metadata from <meta> tags.

    Args:
        url: Either a ``https://patents.google.com/patent/...`` page URL or a
            URL whose last path component is ``<patent id>.<extension>``.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A dict with the string keys ``title``, ``date_issued``,
        ``date_submitted``, ``assignee``, ``url`` and ``patent_num``. A field
        whose <meta> tag is missing (or has no ``content``) is ``''``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    def get_meta(selector):
        # Return the content attribute of the first <meta> tag matching the
        # CSS attribute selector, or '' when there is no such tag/attribute.
        el = soup.select_one(f'meta{selector}')
        if el is None:
            return ''
        return el.attrs.get('content', '')

    patent_url = _resolve_patent_url(url)
    logging.info('Final url: %s', patent_url)

    response = s.get(patent_url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page into a row of
    # empty fields.
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.content, 'html.parser')

    title = get_meta('[name="DC.title"]')
    # Keep only the text before the first '-' of each date value
    # (presumably the year of an ISO-style date -- confirm against the page).
    date_issued = get_meta('[name="DC.date"][scheme="issue"]').split('-')[0]
    date_submitted = get_meta('[name="DC.date"][scheme="dateSubmitted"]').split('-')[0]
    assignee = get_meta('[name="DC.contributor"][scheme="assignee"]')
    # Colons are stripped from the patent number as published in the tag.
    patent_num = get_meta('[name="citation_patent_number"]').replace(':', '')

    return dict(
        title=title,
        date_issued=date_issued,
        date_submitted=date_submitted,
        assignee=assignee,
        url=patent_url,
        patent_num=patent_num,
    )
def main(argv=None):
    """Render one patent as an HTML table and open it in the default browser.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. The first argument, if present, is the patent
            URL. Without arguments the built-in example patent is used.
    """
    args = sys.argv[1:] if argv is None else argv
    url = args[0] if args else 'https://patents.google.com/patent/US3798032'

    patent_data = get_patent_data(url)

    # The values come from a scraped web page: escape them so characters such
    # as '&', '<' or '"' cannot break the table markup or the href attribute.
    safe_fields = {key: html.escape(str(value)) for key, value in patent_data.items()}
    # Build the document before opening the file, so a formatting error does
    # not leave a truncated file behind.
    html_out = HTML_TPL.format(**safe_fields)

    html_path = Path(tempfile.gettempdir()) / 'patent.html'
    # The document declares no charset, so write pure ASCII and let non-ASCII
    # characters become numeric character references (&#NNN;). The output then
    # renders the same regardless of the platform's default encoding.
    with open(html_path, 'w', encoding='ascii', errors='xmlcharrefreplace') as tmpf:
        tmpf.write(html_out)
    logging.info('Output path: %s', html_path)

    # webbrowser.open() expects a URL string, not a Path object.
    webbrowser.open(html_path.as_uri())


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment