Using pandas to convert a `.json-lines` file to `.csv` quickly, at the cost of higher memory usage
import pandas

#
## Settings ##
input_file = "13441370_meta.json-lines"  # Must be a .json-lines file
output_file = "All_reviews.csv"  # Must be a .csv file
new_columns = (  # The new file's columns
    'product_name',
    'product_brand',
    'product_category',
    'product_code',
    'product_url',
    'product_pros',
    'product_cons',
    'average_build_quality',
    'average_ease_of_use',
    'average_features',
    'average_rating',
    'average_value_for_money',
    'build_quality_score',
    'ease_of_use_score',
    'features_score',
    'number_of_reviews',
    'purchase_date',
    'review_date',
    'review_score',
    'reviewer_location',
    'reviewer_name',
    'value_for_money_score',
)
##
#
print(f"[+] Reading input file {input_file}...")
df = pandas.read_json(input_file, lines=True)  # Reads the whole file into memory at once
print(f"[+] Found {len(df)} lines")
column_number = 0
new_df = pandas.DataFrame([], columns=new_columns)
print("[+] Starting...")
# Here we map columns that only need renaming
for new_column, old_column in [("product_name", "name"), ("product_code", "identifier"), ("product_brand", "brand"), ("product_url", "url")]:
    new_df[new_column] = df[old_column]  # Direct column assignment; no row-wise apply needed
    column_number += 1
    print(f"[~] Column {column_number}/{len(new_columns)} done\r", end="", flush=True)
# Here's a column whose value needs transforming: join the category path with ">"
new_df["product_category"] = df["category"].apply(">".join)
column_number += 1
print(f"[~] Column {column_number}/{len(new_columns)} done\r", end="", flush=True)
# Mapping columns from metadata with some checks on them
for col in ("average_rating", "number_of_reviews", "average_ease_of_use", "average_features", "average_build_quality", "average_value_for_money", "review_date", "purchase_date", "review_score", "reviewer_name", "reviewer_location", "product_pros", "product_cons", "ease_of_use_score", "features_score", "build_quality_score", "value_for_money_score"):
    # Replace empty or placeholder values ('None'/'none') with an empty string
    new_df[col] = df.apply(lambda row: row["metadata"][col] if row["metadata"][col] and row["metadata"][col] not in ("None", "none") else "", axis=1)
    column_number += 1
    print(f"[~] Column {column_number}/{len(new_columns)} done\r", end="", flush=True)
#
# `lineterminator` replaced the deprecated `line_terminator` kwarg in pandas >= 1.5
new_df.to_csv(output_file, encoding="utf-8", chunksize=100000, index=False, header=True, lineterminator="\n", escapechar="\\")
print(f"\n[+] Saved new file to {output_file}")
To use less memory, or to run the script on a low-spec machine, you would need to read the input file in chunks and write the output in chunks as well. That is much slower, with the exact cost depending on the chunk size. I can create that version if anyone wants it; a rough sketch follows below.
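For anyone curious, here is a minimal, untested sketch of that chunked variant. It relies on `pandas.read_json` returning an iterator of DataFrames when `chunksize` is set together with `lines=True`. The `CHUNK_SIZE` value is an arbitrary placeholder, and `transform_chunk` is a hypothetical helper standing in for the full column mapping above; only the renamed columns and the category join are shown:

import pandas

CHUNK_SIZE = 10_000  # Arbitrary; tune to the machine's available memory
input_file = "13441370_meta.json-lines"
output_file = "All_reviews.csv"

def transform_chunk(df):
    # Hypothetical helper: applies the same column mapping as the full
    # script, but to a single chunk. Metadata columns are omitted here.
    new_df = pandas.DataFrame()
    for new_column, old_column in [("product_name", "name"), ("product_code", "identifier"), ("product_brand", "brand"), ("product_url", "url")]:
        new_df[new_column] = df[old_column]
    new_df["product_category"] = df["category"].apply(">".join)
    # ... the metadata columns would be handled here exactly as above ...
    return new_df

# With `chunksize`, read_json returns an iterator of DataFrames, so only
# one chunk is held in memory at a time.
reader = pandas.read_json(input_file, lines=True, chunksize=CHUNK_SIZE)
for i, chunk in enumerate(reader):
    transform_chunk(chunk).to_csv(
        output_file,
        mode="w" if i == 0 else "a",  # Overwrite on the first chunk, then append
        header=(i == 0),              # Write the header row only once
        index=False,
        encoding="utf-8",
        escapechar="\\",
    )
print(f"[+] Saved new file to {output_file}")

The trade-off is exactly as described: each chunk pays the overhead of a separate transform-and-append cycle, so smaller chunks mean lower peak memory but a longer total runtime.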