This is a script to convert the csv of crawling at home to parquet
See the result at https://colab.research.google.com/drive/14Hc_fUUOrG9260VzD_XsTxWX7f5cptyL?usp=sharing
'''
Compute some stats on cah collection
First get the files with:
https://gist.github.com/rom1504/f427b1c82df26c9993daa36fca7f9881
Then pip install pyspark
Then run this file. It also takes a few minutes
The main thing this script is doing is adding/removing/reordering csv columns and converting to fewer parquet files
The end result is easy to use in spark, pandas or anything else
'''
from glob import glob
from multiprocessing import Pool
from collections import defaultdict
from pathlib import Path


def f(w):
    """Return the column names from the header line of a '|'-delimited csv file.

    Args:
        w: path to a csv file whose first line is a '|'-separated header.

    Returns:
        list[str]: the header column names.
    """
    # `with` guarantees the file handle is closed (the original leaked it).
    with open(w, "r") as header_file:
        return header_file.readline().rstrip().split("|")
def main():
    """Read every crawling-at-home csv, normalize columns, and write parquet.

    The csv schema changed over time, so files are first grouped by their
    header; for each schema group, missing reference columns are added empty
    and columns are reordered to a single reference order so all groups can
    be unioned. The result is written as 16 parquet files, then deduplicated
    on (URL, TEXT) into a second parquet dataset, printing basic counts.
    """
    print("Retrieving columns of all csv files")
    csv_files = [str(x) for x in Path('/media/hd/cah/drive').glob("**/*.csv")]
    csv_files += [str(x) for x in Path('/media/hd/cah/theeye/output/cah').glob("**/*.csv")]
    # Header reading is pure I/O, so a large pool helps; the context manager
    # terminates the workers when done (the original never closed the pool).
    with Pool(128) as pool:
        headers = pool.map(f, csv_files)

    print("Grouping files by columns")
    # necessary because the schema changed
    files_by_schema = defaultdict(list)
    for cols, path in zip(headers, csv_files):
        files_by_schema[",".join(cols)].append(path)

    print("Starting spark session")
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import lit
    # You can open http://localhost:4040 to follow progress on the spark operations
    spark = (
        SparkSession.builder.config("spark.driver.memory", "16G")
        .master("local[16]")
        .appName('spark-stats')
        .getOrCreate()
    )
    ref_cols = ['SAMPLE_ID', 'URL', 'TEXT', 'HEIGHT', 'WIDTH', 'LICENSE', 'NSFW', 'similarity']
    total = None
    print("Reading all collections of csv, removing, adding and reordering columns as needed")
    for schema, paths in files_by_schema.items():
        cols = schema.split(",")
        incols = [x for x in cols if x in ref_cols]
        print("incols", incols)
        w = spark.read.options(delimiter="|", header=True).csv(paths).select(*incols)
        addcols = [x for x in ref_cols if x not in cols]
        print("addcols", addcols)
        for c in addcols:
            # Placeholder empty column for fields this schema version lacked.
            w = w.withColumn(c, lit(""))
        # Same column order everywhere so the union lines columns up correctly.
        w = w.select(*ref_cols)
        total = w if total is None else total.union(w)

    print("Casting columns to the right types")
    total = total.withColumn("SAMPLE_ID", total["SAMPLE_ID"].cast("bigint"))
    total = total.withColumn("WIDTH", total["WIDTH"].cast("int"))
    total = total.withColumn("HEIGHT", total["HEIGHT"].cast("int"))
    total = total.withColumn("similarity", total["similarity"].cast("double"))

    print("Repartitionning and writing to 16 parquet files to cah_dataframe")
    total.repartition(16).write.mode("overwrite").parquet("cah_dataframe")
    ok = spark.read.parquet("cah_dataframe")
    print("Rereading the parquet and computing some basic stats")
    print("Size of collection", ok.count())
    uniques = ok.drop_duplicates(["URL", "TEXT"])
    uniques.repartition(16).write.mode("overwrite").parquet("cah_dataframe_unique")
    ok_unique = spark.read.parquet("cah_dataframe_unique")
    print("Number of uniques", ok_unique.count())


# Guard is required: multiprocessing re-imports this module in worker
# processes on spawn-based platforms, and the original unguarded call
# would re-run main() there.
if __name__ == "__main__":
    main()
'''
Once you computed the parquet files with unique items,
let's compute more stats
'''
from pyspark.sql import SparkSession
import pyspark.sql.functions as F


def main():
    """Print schema, sample rows and quantile stats for the unique cah parquet set."""
    spark = (
        SparkSession.builder.config("spark.driver.memory", "16G")
        .master("local[16]")
        .appName('spark-stats')
        .getOrCreate()
    )
    df = spark.read.parquet("cah_dataframe_unique")
    df.printSchema()
    df.show(truncate=False)
    # Deciles 0.1 .. 0.9, computed with a loose 0.1 relative error so the
    # approxQuantile scans stay cheap on the full collection.
    deciles = [0.1 * x for x in range(1, 10)]
    print("width quantiles", df.approxQuantile("WIDTH", deciles, 0.1))
    print("height quantiles", df.approxQuantile("HEIGHT", deciles, 0.1))
    print("similarity quantiles", df.approxQuantile("similarity", deciles, 0.1))
    df = df.withColumn("lentext", F.length("TEXT"))
    print("text length quantiles", df.approxQuantile("lentext", deciles, 0.1))
    print("Number of uniques", df.count())


if __name__ == "__main__":
    main()
root | |
|-- SAMPLE_ID: long (nullable = true) |-- URL: string (nullable = true) | |
|-- TEXT: string (nullable = true) | |
|-- HEIGHT: integer (nullable = true) | |
|-- WIDTH: integer (nullable = true) |-- LICENSE: string (nullable = true) | |
|-- NSFW: string (nullable = true) |-- similarity: double (nullable = true) | |
+-----------+------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------- | |
---------------------------------------------------------------------+------+-----+-------+--------+-------------------+ | |
|SAMPLE_ID |URL |TEXT | |
|HEIGHT|WIDTH|LICENSE|NSFW |similarity | | |
+-----------+------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------- | |
---------------------------------------------------------------------+------+-----+-------+--------+-------------------+ | |
---------------------------------------------------------------------+------+-----+-------+--------+-------------------+ | |
|41826002453|http://cdn-s3-3.wanelo.com/product/image/5753773/x200.jpg |Hoop Earrings Beaded Hoop Earrings |200 |200 |? |UNLIKELY|0.3015734851360321 | | |
|11286064458|http://images.knetbooks.com/images/d/7/232/9780321087232.jpg |Practical Guide to Secondary Social Studies, A |187 |187 |? |UNLIKELY|0.3707329034805298 | | |
|15923025895|http://static8.depositphotos.com/1231854/999/i/450/dep_9991553-Young-glamorous-blonde-with-shopping-bag-holding-toy-terrier-dogs-holding-dog.jpg|Young glamorous blonde with shopping bag holding toy terrier dogs holding dog — Stock Photo #9991553|398 |600 |? |UNLIKELY|0.3059745728969574 | | |
|10787043769|https://www.handcraftedmodelships.com/pictures/main/lighthouse-decor-beach-decorations23.jpg |Wooden White Net and Rope Lighthouse 15 |400 |300 |? |UNLIKELY|0.3204681873321533 | | |
|30762015899|http://media.rightmove.co.uk/11k/10211/41348840/10211_311626A_11626_IMG_16_0000_max_135x100.JPG |3 bed Cottage for sale in Berkeley, Gloucestershire |135 |90 |? |UNLIKELY|0.31845608353614807| | |
|33202002260|http://g2.img-dpreview.com/EE6A97AE86E14336BE9D29331FA5737B.jpg |Hang Glide over Cowichan |120 |90 |? |UNLIKELY|0.32427337765693665| | |
|42981003590|http://cdn-s3-3.wanelo.com/product/image/3939089/x200.jpg |ANY Size ANY Colorway x High-Waisted Aztec Frayed Denim Shorts |200 |200 |? |UNLIKELY|0.3527474105358124 | | |
|17772022270|http://images3.chictopia.com/photos/Maddinka/7896011255/yellow-wholesale-dress-bag-blue-stradivarius-top-cream-mango-skirt.jpg |yellow Wholesale-Dress bag - blue Stradivarius top - cream Mango skirt |300 |450 |? |UNLIKELY|0.3499804437160492 | | |
|27789015457|http://static6.depositphotos.com/1036080/659/i/110/depositphotos_6598667-Couple-of-amazing-black-dobermans.jpg |Couple of amazing black dobermans - Foto de Stock |110 |110 |? |UNLIKELY|0.3847452998161316 | | |
|34655009450|http://cdn1.image.tube8.phncdn.com/201103/07/706841/190x143/2.jpg |Army girl gets fucked by ... |190 |143 |? |UNLIKELY|0.3381859362125397 | | |
|10112032340|http://cdn-s3-3.wanelo.com/product/image/1656336/original.jpg |Elegance Shawl / Scarf with Lacy Edge - leopard- |570 |715 |? |UNLIKELY|0.34083840250968933| | |
|23205009139|http://i0.wp.com/hypebeast.com/image/2012/09/mastermind-japan-carhartt-2012-fall-winter-capsule-collection-1.jpg?w=570 |Image of mastermind JAPAN x Carhartt 2012 Fall/Winter Capsule Collection |570 |854 |? |UNLIKELY|0.3161979019641876 | | |
|8161023068 |http://d2d00szk9na1qq.cloudfront.net/Product/36c0458d-8d82-4b0b-b1cb-b880acee1c3d/Images/List_UO-507.jpg |Magnolia Home Fashions Oxford Stripe Charcoal |150 |150 |? |UNLIKELY|0.3259333670139313 | | |
|4838750426 |http://www.toggle.co.nz/media/catalog/product/cache/1/thumbnail/130x/9df78eab33525d08d6e5fb8d27136e95/k/e/keepcalm_blue.jpg |"Keep Calm" - Blue Canvas |130 |130 | |UNLIKELY|0.33144283294677734| | |
|3469000780 |http://media.bdaily.s3.amazonaws.com/images/avatars/large/10682.jpg |Thomas Eggar |82 |82 | |UNLIKELY|0.35470208525657654| | |
|12408032134|http://rlv.zcache.co.uk/ornate_formal_black_white_damask_custom_tie-rfe02c92ee866464db7d5b7c97571d0b5_v9whb_8byvr_216.jpg |ornate formal black white damask custom tie |216 |216 |? |UNLIKELY|0.3525067865848541 | | |
|24984002927|http://cdn2.newsok.biz/cache/w640-6f653feb73c40138b825518939ac3557.jpg |The old Iten Biscuit Co. is now a U-Haul center. THE OKLAHOMAN ARCHIVES |640 |511 |? |UNLIKELY|0.3311549127101898 | | |
|12525060020|http://s7d4.scene7.com/is/image/Belk?layer=0&src=1802968_202178317001_A_001_T10L00&layer=comp&$P_PROD$ |Lauren Ralph Lauren Plus Size Drawstring Cotton Cropped Pant |233 |338 |? |UNLIKELY|0.3262045085430145 | | |
|12231074283|https://www.sportstadion.nl/media/catalog/category/newcastle-united.jpg |Newcastle United |698 |313 |? |UNLIKELY|0.3026922047138214 | | |
|42445000967|http://media3.onsugar.com/files/upl1/10/104166/39_2008/catgenie.xlarge/i/Upgrade-Your-CatGenie-Power-Flush-System-Free.jpg |Upgrade Your CatGenie with the Power Flush System for Free! |320 |150 |? |UNLIKELY|0.3295552730560303 | | |
+-----------+------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------+------+-----+-------+--------+-------------------+ | |
only showing top 20 rows | |
width quantiles [0.0, 120.0, 151.0, 180.0, 215.0, 270.0, 273.0, 370.0, 39580.0] | |
height quantiles [0.0, 128.0, 160.0, 184.0, 216.0, 250.0, 300.0, 446.0, 18849.0] | |
similarity quantiles [0.0, 0.3069250285625458, 0.3145156800746918, 0.32135993242263794, 0.32168570160865784, 0.32813096046447754, 0.339599609375, 0.35535627603530884, 6016.0] | |
text length quantiles [1.0, 25.0, 33.0, 39.0, 45.0, 50.0, 56.0, 73.0, 61192.0] |