Skip to content

Instantly share code, notes, and snippets.

@mwicat
Created February 5, 2024 21:28
Show Gist options
  • Save mwicat/8ef974c1c4e6d3f0d1dee6bb37071127 to your computer and use it in GitHub Desktop.
Save mwicat/8ef974c1c4e6d3f0d1dee6bb37071127 to your computer and use it in GitHub Desktop.
"""
Script that takes a patent URL from Google Patents, creates a small HTML table with the patent metadata, and opens it
in your default browser. From there you can copy the table to the clipboard and paste it into a spreadsheet, where it
arrives formatted and parsed as a proper table. Besides spreadsheets, any program that can process HTML tables from the
clipboard will also work.
URL formats:
- https://patents.google.com/patent/[PATENT ID]
- https://patentimages.storage.googleapis.com/.../[PATENT ID].pdf
Examples:
- https://patents.google.com/patent/US3798032
- https://patentimages.storage.googleapis.com/21/6e/08/1fc5322ac7d593/US3798032.pdf
"""
import os
import logging
import tempfile
from pathlib import Path
import webbrowser
import requests
import bs4
# Log at INFO level so the logging.info() calls below (resolved URL, output
# path) are actually shown when the script runs.
logging.basicConfig(level=logging.INFO)
# One-row HTML table filled in with str.format(); the placeholders match the
# keys of the dict returned by get_patent_data(). The "!" and "patent" cells
# are fixed text.
HTML_TPL = '''
<table border="1" style="font-family: arial">
<tr>
<td>!</td>
<td><a href="{url}">{title}</a></td>
<td>patent</td>
<td>{patent_num}</td>
<td>{date_issued}</td>
<td>{date_submitted}</td>
<td>{assignee}</td>
</tr>
</table>
'''
# Module-level HTTP session used by get_patent_data() for the page request.
s = requests.Session()
def get_patent_data(url):
    """Fetch a Google Patents page and extract its metadata.

    Args:
        url: Either a ``https://patents.google.com/patent/...`` page URL, or
            any other URL whose last path component is ``<PATENT ID>.<ext>``
            (for example a patentimages PDF link). The latter is rewritten
            to the matching Google Patents page URL.

    Returns:
        A dict with the keys ``title``, ``date_issued``, ``date_submitted``,
        ``assignee``, ``url`` and ``patent_num``. A field whose ``<meta>``
        tag (or its ``content`` attribute) is absent is an empty string.
        Both dates are cut down to the text before the first ``-``, and
        colons are removed from the patent number.

    Raises:
        requests.HTTPError: If the page request returns an error status.
        requests.RequestException: On connection failure or timeout.
    """
    # Imported locally so this block needs no new file-level import.
    from urllib.parse import urlsplit

    def get_meta(selector):
        """Return the ``content`` of the first matching <meta>, or ''."""
        el = soup.select_one(f'meta{selector}')
        if el is None:
            return ''
        # .get() keeps the "missing data -> empty string" contract even for
        # a <meta> tag that has no content attribute.
        return el.get('content', '')

    if not url.startswith('https://patents.google.com/patent/'):
        # Use only the path component, so a query string or fragment on the
        # link cannot leak into the patent id.
        basename = os.path.basename(urlsplit(url).path)
        patent_num = os.path.splitext(basename)[0]
        patent_url = f'https://patents.google.com/patent/{patent_num}'
    else:
        patent_url = url
    logging.info('Final url: %s', patent_url)

    # Without a timeout the request can block forever; without the status
    # check an error page would silently produce an all-empty table row.
    response = s.get(patent_url, timeout=30)
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.content, 'html.parser')

    title = get_meta('[name="DC.title"]')
    date_issued = get_meta('[name="DC.date"][scheme="issue"]').split('-')[0]
    date_submitted = get_meta('[name="DC.date"][scheme="dateSubmitted"]').split('-')[0]
    assignee = get_meta('[name="DC.contributor"][scheme="assignee"]')
    patent_num = get_meta('[name="citation_patent_number"]').replace(':', '')

    patent_data = dict(
        title=title,
        date_issued=date_issued,
        date_submitted=date_submitted,
        assignee=assignee,
        url=patent_url,
        patent_num=patent_num,
    )
    return patent_data
if __name__ == '__main__':
    # Script entry point: fetch the metadata, render the one-row table into
    # a file in the temp directory, and open that file in the browser.
    import sys
    from html import escape

    # An optional first command-line argument overrides the example URL.
    default_url = 'https://patents.google.com/patent/US3798032'
    url = sys.argv[1] if len(sys.argv) > 1 else default_url
    patent_data = get_patent_data(url)

    # The values come from a fetched web page; escape them so characters
    # such as & < > " cannot break the generated markup.
    safe_data = {key: escape(value) for key, value in patent_data.items()}
    html_out = HTML_TPL.format(**safe_data)

    tmpdir = tempfile.gettempdir()
    html_path = Path(tmpdir) / 'patent.html'
    # The template declares no charset, so write pure ASCII and let
    # non-ASCII characters become numeric character references; the page
    # then renders the same regardless of platform or browser defaults.
    with open(html_path, 'w', encoding='ascii',
              errors='xmlcharrefreplace') as tmpf:
        tmpf.write(html_out)
    logging.info('Output path: %s', html_path)
    # webbrowser.open() expects a URL string, not a Path object.
    webbrowser.open(html_path.as_uri())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment