This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
| from hashlib import md5 | |
def text_to_hash(text: str) -> str:
    """Return the hexadecimal MD5 digest of *text*.

    The string is encoded as UTF-8 before hashing, so equal strings always
    produce the same 32-character lowercase hex digest.

    Args:
        text: The string to fingerprint.

    Returns:
        The MD5 digest of the UTF-8 bytes of ``text`` as a hex string.
    """
    # NOTE: MD5 is fine as a content fingerprint but is not collision-resistant;
    # do not rely on it for anything security-sensitive.
    return md5(text.encode("utf-8")).hexdigest()
| def add_hash(column_name="document"): | |
| """ | |
| Decorator. Wraps a function that returns a dataframe, must have column_name in columns. | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # checked against python 3.7.3, pandas 0.24.2, s3fs 0.4.2 | |
| import tarfile | |
| import io | |
| import s3fs | |
| import pandas as pd | |
# Location of the gzipped tar archive in S3.
# (A plain string literal: the original f-string had no placeholders.)
tar_path = "s3://my-bucket/debug.tar.gz"  # path in s3
# Member of the archive to read.
metadata_path = "debug/metadata.csv"  # path inside of the tar file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import feedparser # via conda install anaconda::feedparser | |
| import yaml | |
| from bs4 import BeautifulSoup | |
# Goodreads shelf RSS feed URL.
# NOTE(review): the <X...> segments look like redacted placeholders for the
# user id, API key and shelf name -- fill them in before use.
_GOODREADS_RSS_STREAM_URL = "https://www.goodreads.com/review/list_rss/<XXXXXXXXXX>?key=<XXXXXXXXXXXXXX>&shelf=<XXXX>"
# Old yaml lives here. We'll use it to ensure our new dump has unique values.
_EXISTING_YAML_PATH = "docs/_data/books.yml"
# Path the new yaml is written to.
_NEW_YAML_PATH = "books.yaml"
Older | Newer