mwicat · February 5, 2024 21:28
diff --git a/patent2table.py b/patent2table.py
 """
 Script that takes a patent URL from Google Patents, creates a small HTML table with the patent's metadata, and opens it
 in your default browser. From there, you can copy the table to the clipboard and paste it into a spreadsheet, where it
 arrives nicely formatted and parsed as a proper table. Besides spreadsheets, any program that can process HTML tables
 from the clipboard will also work.

 URL formats:
 - https://patents.google.com/patent/[PATENT ID]
 - https://patentimages.storage.googleapis.com/.../[PATENT ID].pdf

 Examples:
 - https://patents.google.com/patent/US3798032
 - https://patentimages.storage.googleapis.com/21/6e/08/1fc5322ac7d593/US3798032.pdf

 """

 import os
 import logging
 import tempfile
 from pathlib import Path
 import webbrowser

 import requests
 import bs4

 # Log at INFO level so the messages emitted below (resolved URL, output path)
 # are shown.
 logging.basicConfig(level=logging.INFO)


 # One-row HTML table, filled in with str.format(). The placeholders ({url},
 # {title}, {patent_num}, {date_issued}, {date_submitted}, {assignee}) match the
 # keys of the dict returned by get_patent_data(); the "!" and "patent" cells
 # are fixed text.
 HTML_TPL = '''
 <table border="1" style="font-family: arial">
 <tr>
 <td>!</td>
 <td><a href="{url}">{title}</a></td>
 <td>patent</td>
 <td>{patent_num}</td>
 <td>{date_issued}</td>
 <td>{date_submitted}</td>
 <td>{assignee}</td>
 </tr>
 </table>
 '''

 # Module-level HTTP session; get_patent_data() uses it to download the page.
 s = requests.Session()


 def get_patent_data(url, timeout=30):
    """Download a Google Patents page and extract the patent's metadata.

    Args:
        url: Either a Google Patents page URL
            (``https://patents.google.com/patent/<PATENT ID>``), which is used
            as is, or any other URL/path whose last component is
            ``<PATENT ID>.<extension>`` (e.g. a patentimages PDF link), from
            which the Google Patents page URL is derived.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        A dict with the string values ``title``, ``date_issued``,
        ``date_submitted``, ``assignee``, ``url`` (the page that was actually
        fetched) and ``patent_num``. The two dates are cut at the first ``-``
        (the year for ``YYYY-MM-DD`` values). A field whose ``<meta>`` tag is
        absent from the page is an empty string.

    Raises:
        ValueError: If no patent id can be derived from ``url``.
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    from urllib.parse import urlsplit

    google_prefix = 'https://patents.google.com/patent/'

    if url.startswith(google_prefix):
        patent_url = url
    else:
        # Look only at the path component, so that a query string or fragment
        # (".../US3798032.pdf?download=1") cannot end up in the patent id.
        basename = os.path.basename(urlsplit(url).path)
        patent_id = os.path.splitext(basename)[0]
        if not patent_id:
            raise ValueError(f'Cannot derive a patent id from url: {url!r}')
        patent_url = f'{google_prefix}{patent_id}'

    logging.info('Final url: %s', patent_url)

    response = s.get(patent_url, timeout=timeout)
    # Fail loudly on an error response instead of scraping the error page and
    # returning a record made of empty strings.
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.content, 'html.parser')

    def get_meta(selector):
        """Return the content of the first <meta> tag matching selector, or ''."""
        el = soup.select_one(f'meta{selector}')
        if el is None:
            return ''
        # A <meta> tag without a content attribute counts as "no value".
        return el.get('content', '')

    title = get_meta('[name="DC.title"]')
    date_issued = get_meta('[name="DC.date"][scheme="issue"]').split('-')[0]
    date_submitted = get_meta('[name="DC.date"][scheme="dateSubmitted"]').split('-')[0]
    assignee = get_meta('[name="DC.contributor"][scheme="assignee"]')
    # Drop any ':' characters from the published patent number.
    patent_num = get_meta('[name="citation_patent_number"]').replace(':', '')

    return dict(
        title=title,
        date_issued=date_issued,
        date_submitted=date_submitted,
        assignee=assignee,
        url=patent_url,
        patent_num=patent_num,
    )


 if __name__ == '__main__':
    # Script entry point: fetch the patent metadata, render it into HTML_TPL,
    # save the result in the temp directory and show it in the browser.
    import sys
    from html import escape

    # Take the patent URL from the command line; with no argument fall back to
    # the example patent so that a bare run still produces a table.
    if len(sys.argv) > 1:
        url = sys.argv[1]
    else:
        url = 'https://patents.google.com/patent/US3798032'

    patent_data = get_patent_data(url)

    # The values are scraped from a web page: escape them so that characters
    # such as '&', '<' or '"' cannot break the table markup or the href.
    safe_data = {key: escape(str(value)) for key, value in patent_data.items()}
    html_out = HTML_TPL.format(**safe_data)

    html_path = Path(tempfile.gettempdir()) / 'patent.html'

    # The template declares no charset, so write pure ASCII and let non-ASCII
    # characters become numeric character references (&#NNN;). The output then
    # no longer depends on the platform's default encoding or on the browser
    # guessing the right one.
    with open(html_path, 'w', encoding='ascii', errors='xmlcharrefreplace') as tmpf:
        tmpf.write(html_out)

    logging.info('Output path: %s', html_path)

    # webbrowser.open() expects a URL string, not a Path object.
    webbrowser.open(html_path.resolve().as_uri())
	"""
	Script that takes a patent URL from Google Patents, creates a small HTML table with the patent's metadata, and opens it
	in your default browser. From there, you can copy the table to the clipboard and paste it into a spreadsheet, where it
	arrives nicely formatted and parsed as a proper table. Besides spreadsheets, any program that can process HTML tables
	from the clipboard will also work.

	URL formats:
	- https://patents.google.com/patent/[PATENT ID]
	- https://patentimages.storage.googleapis.com/.../[PATENT ID].pdf

	Examples:
	- https://patents.google.com/patent/US3798032
	- https://patentimages.storage.googleapis.com/21/6e/08/1fc5322ac7d593/US3798032.pdf

	"""

	import os
	import logging
	import tempfile
	from pathlib import Path
	import webbrowser

	import requests
	import bs4

	# Log at INFO level so the messages emitted below (resolved URL, output path)
	# are shown.
	logging.basicConfig(level=logging.INFO)


	# One-row HTML table, filled in with str.format(). The placeholders ({url},
	# {title}, {patent_num}, {date_issued}, {date_submitted}, {assignee}) match the
	# keys of the dict returned by get_patent_data(); the "!" and "patent" cells
	# are fixed text.
	HTML_TPL = '''
	<table border="1" style="font-family: arial">
	<tr>
	<td>!</td>
	<td><a href="{url}">{title}</a></td>
	<td>patent</td>
	<td>{patent_num}</td>
	<td>{date_issued}</td>
	<td>{date_submitted}</td>
	<td>{assignee}</td>
	</tr>
	</table>
	'''

	# Module-level HTTP session; get_patent_data() uses it to download the page.
	s = requests.Session()


	def get_patent_data(url, timeout=30):
	    """Download a Google Patents page and extract the patent's metadata.

	    Args:
	        url: Either a Google Patents page URL
	            (``https://patents.google.com/patent/<PATENT ID>``), which is used
	            as is, or any other URL/path whose last component is
	            ``<PATENT ID>.<extension>`` (e.g. a patentimages PDF link), from
	            which the Google Patents page URL is derived.
	        timeout: Seconds to wait for the server before giving up. Without a
	            timeout a stalled connection would block the script forever.

	    Returns:
	        A dict with the string values ``title``, ``date_issued``,
	        ``date_submitted``, ``assignee``, ``url`` (the page that was actually
	        fetched) and ``patent_num``. The two dates are cut at the first ``-``
	        (the year for ``YYYY-MM-DD`` values). A field whose ``<meta>`` tag is
	        absent from the page is an empty string.

	    Raises:
	        ValueError: If no patent id can be derived from ``url``.
	        requests.HTTPError: If the server answers with a 4xx/5xx status.
	    """
	    from urllib.parse import urlsplit

	    google_prefix = 'https://patents.google.com/patent/'

	    if url.startswith(google_prefix):
	        patent_url = url
	    else:
	        # Look only at the path component, so that a query string or fragment
	        # (".../US3798032.pdf?download=1") cannot end up in the patent id.
	        basename = os.path.basename(urlsplit(url).path)
	        patent_id = os.path.splitext(basename)[0]
	        if not patent_id:
	            raise ValueError(f'Cannot derive a patent id from url: {url!r}')
	        patent_url = f'{google_prefix}{patent_id}'

	    logging.info('Final url: %s', patent_url)

	    response = s.get(patent_url, timeout=timeout)
	    # Fail loudly on an error response instead of scraping the error page and
	    # returning a record made of empty strings.
	    response.raise_for_status()
	    soup = bs4.BeautifulSoup(response.content, 'html.parser')

	    def get_meta(selector):
	        """Return the content of the first <meta> tag matching selector, or ''."""
	        el = soup.select_one(f'meta{selector}')
	        if el is None:
	            return ''
	        # A <meta> tag without a content attribute counts as "no value".
	        return el.get('content', '')

	    title = get_meta('[name="DC.title"]')
	    date_issued = get_meta('[name="DC.date"][scheme="issue"]').split('-')[0]
	    date_submitted = get_meta('[name="DC.date"][scheme="dateSubmitted"]').split('-')[0]
	    assignee = get_meta('[name="DC.contributor"][scheme="assignee"]')
	    # Drop any ':' characters from the published patent number.
	    patent_num = get_meta('[name="citation_patent_number"]').replace(':', '')

	    return dict(
	        title=title,
	        date_issued=date_issued,
	        date_submitted=date_submitted,
	        assignee=assignee,
	        url=patent_url,
	        patent_num=patent_num,
	    )


	if __name__ == '__main__':
	    # Script entry point: fetch the patent metadata, render it into HTML_TPL,
	    # save the result in the temp directory and show it in the browser.
	    import sys
	    from html import escape

	    # Take the patent URL from the command line; with no argument fall back to
	    # the example patent so that a bare run still produces a table.
	    if len(sys.argv) > 1:
	        url = sys.argv[1]
	    else:
	        url = 'https://patents.google.com/patent/US3798032'

	    patent_data = get_patent_data(url)

	    # The values are scraped from a web page: escape them so that characters
	    # such as '&', '<' or '"' cannot break the table markup or the href.
	    safe_data = {key: escape(str(value)) for key, value in patent_data.items()}
	    html_out = HTML_TPL.format(**safe_data)

	    html_path = Path(tempfile.gettempdir()) / 'patent.html'

	    # The template declares no charset, so write pure ASCII and let non-ASCII
	    # characters become numeric character references (&#NNN;). The output then
	    # no longer depends on the platform's default encoding or on the browser
	    # guessing the right one.
	    with open(html_path, 'w', encoding='ascii', errors='xmlcharrefreplace') as tmpf:
	        tmpf.write(html_out)

	    logging.info('Output path: %s', html_path)

	    # webbrowser.open() expects a URL string, not a Path object.
	    webbrowser.open(html_path.resolve().as_uri())