Skip to content

Instantly share code, notes, and snippets.

@alexsisu
Created March 19, 2020 14:55
Show Gist options
  • Save alexsisu/0eb74f0298f55f64046acbe56833a4ae to your computer and use it in GitHub Desktop.
Save alexsisu/0eb74f0298f55f64046acbe56833a4ae to your computer and use it in GitHub Desktop.
mport os
import sys
import psutil
import pyarrow.parquet as pq
import time
import cudf
import subprocess
from subprocess import Popen, PIPE
from blazingsql import BlazingContext
def retrieve_cudf_memory():
process = Popen(["/usr/bin/nvidia-smi", "-q"], stdout=PIPE)
(output, err) = process.communicate()
exit_code = process.wait()
memory_used = 0
memory_free = 0
fb_memory_usage = "FB Memory Usage"
all_lines = str(output).split("\\n")
for i in range(0, len(all_lines)):
line = all_lines[i]
if fb_memory_usage in line.strip():
memory_used = all_lines[i + 2]
memory_free = all_lines[i + 3]
return memory_used.strip(), memory_free.strip()
def retrieve_parquet_files_from_folder(folder_name):
target_files = []
for entry in os.listdir(folder_name):
if entry.endswith("parquet"):
target_file = os.path.join(folder_name, entry)
target_files.append(target_file)
return target_files
bc = BlazingContext(allocator="managed",
pool=False,
initial_pool_size=None,
enable_logging=False)
target_folder2 = "/mnt/conduit_parquet_store/file_conduitbugbash_books15mil_snappy_parquet/"
parquet_files = retrieve_parquet_files_from_folder(target_folder2)
process = psutil.Process(os.getpid())
rss_start = process.memory_info().rss
bc.create_table("books15mils", parquet_files)
memory_used, memory_free = retrieve_cudf_memory()
print(f"[GPU] {memory_used} [GPU] {memory_free} ")
queries = [("warmup", "select count(*) from `file_conduitbugbash`.`books15mil` limit 1"),
("count1", "select count(*) from `file_conduitbugbash`.`books15mil`"),
("count2", "select count(*) from `file_conduitbugbash`.`books15mil`"),
("agg1",
"select a.`Year-Of-Publication`,count(a.`Publisher`) from `file_conduitbugbash`.`books15mil` a group by a.`Year-Of-Publication`"),
("agg2",
"select a.`Year-Of-Publication`,count(ISBN) from `file_conduitbugbash`.`books15mil` a group by a.`Year-Of-Publication` limit 10"),
("agg3",
"select a.`Year-Of-Publication`,count(a.`Book-Author`) from `file_conduitbugbash`.`books15mil` a group by a.`Year-Of-Publication` limit 10"),
("agg4",
"select a.`Book-Author`,count(a.`Year-Of-Publication`) from `file_conduitbugbash`.`books15mil` a group by a.`Book-Author` limit 10"),
("agg5",
"select a.`Book-Author`,a.`Year-Of-Publication`,count(a.`Publisher`) from `file_conduitbugbash`.`books15mil` a group by a.`Book-Author`,a.`Year-Of-Publication` limit 10"),
("q1",
"select a.`ISBN`, a.`Book-Title`, a.`Book-Author`, a.`Year-Of-Publication`, a.`Publisher`, a.`Image-URL-S`, a.`Image-URL-M`, a.`Image-URL-L` FROM `file_conduitbugbash`.`books15mil` a limit 1000"),
("q2", "select count(a.`ISBN`) FROM `file_conduitbugbash`.`books15mil` a"),
("q3",
"select a.`Publisher`, COUNT(a.`ISBN`) FROM `file_conduitbugbash`.`books15mil` a GROUP BY a.`Publisher` limit 1001"),
("q4",
"select a.`Book-Author`, a.`Publisher`, COUNT(a.`ISBN`) FROM `file_conduitbugbash`.`books15mil` a GROUP BY a.`Book-Author`, a.`Publisher` limit 1001"),
("q5",
"select a.`Book-Title`, COUNT(a.`ISBN`) FROM `file_conduitbugbash`.`books15mil` a GROUP BY a.`Book-Title` limit 1001"),
("q6",
"select a.`Book-Title`, a.`Year-Of-Publication`, COUNT(a.`ISBN`) FROM `file_conduitbugbash`.`books15mil` a GROUP BY a.`Book-Title`, a.`Year-Of-Publication` limit 1001"),
("q7",
"select C_43 AS C_1, C_0 AS C_3, C_4331 AS C_2 FROM (select a.`Book-Author` AS C_43, a.`Publisher` AS C_0, COUNT(a.`ISBN`) AS C_4331 FROM `file_conduitbugbash`.`books15mil` a GROUP BY a.`Book-Author`, a.`Publisher` ) C_4954424c WHERE ((CASE WHEN (C_4331 IS NULL ) THEN 1 ELSE CASE WHEN (C_4331 < 340) THEN 1 ELSE 0 END END = 1) OR (CASE WHEN (C_4331 IS NULL ) THEN 0 ELSE CASE WHEN (C_4331 = 340) THEN 1 ELSE 0 END END = 1) AND ((C_43 >= 'Dick King-Smith') AND (C_43 IS NOT NULL ) AND (C_0 IS NOT NULL ) AND (C_0 = 'Walker Books') OR (C_0 IS NOT NULL ) AND (C_0 > 'Walker Books'))) LIMIT 1001"),
("f1",
"SELECT C_43 AS C_8, C_5 AS C_7, COUNT(C_0) AS C_4331 FROM (SELECT C_66696c655f636f6e64756974627567626173685f626f6f6b7331356d696c.`ISBN` AS C_0, C_66696c655f636f6e64756974627567626173685f626f6f6b7331356d696c.`Book-Title` AS C_1, C_66696c655f636f6e64756974627567626173685f626f6f6b7331356d696c.`Book-Author` AS C_43, C_66696c655f636f6e64756974627567626173685f626f6f6b7331356d696c.`Year-Of-Publication` AS C_5, C_66696c655f636f6e64756974627567626173685f626f6f6b7331356d696c.`Publisher` AS C_2, C_66696c655f636f6e64756974627567626173685f626f6f6b7331356d696c.`Image-URL-S` AS C_3, C_66696c655f636f6e64756974627567626173685f626f6f6b7331356d696c.`Image-URL-M` AS C_4, C_66696c655f636f6e64756974627567626173685f626f6f6b7331356d696c.`Image-URL-L` AS C_6 FROM `file_conduitbugbash`.`books15mil` C_66696c655f636f6e64756974627567626173685f626f6f6b7331356d696c WHERE C_66696c655f636f6e64756974627567626173685f626f6f6b7331356d696c.`Year-Of-Publication` IN ('2003', '2004', '2005', '2000', '2001', '2002') ) C_4954424c GROUP BY C_43, C_5 LIMIT 1001"),
("f2",
"SELECT C_2 AS C_8, C_1 AS C_7, COUNT(C_4) AS C_4331 FROM (SELECT C_66696c655f636f6e64756974627567626173685f626f6f6b7331356d696c.`ISBN` AS C_4, C_66696c655f636f6e64756974627567626173685f626f6f6b7331356d696c.`Book-Title` AS C_5, C_66696c655f636f6e64756974627567626173685f626f6f6b7331356d696c.`Book-Author` AS C_2, C_66696c655f636f6e64756974627567626173685f626f6f6b7331356d696c.`Year-Of-Publication` AS C_3, C_66696c655f636f6e64756974627567626173685f626f6f6b7331356d696c.`Publisher` AS C_1, C_66696c655f636f6e64756974627567626173685f626f6f6b7331356d696c.`Image-URL-S` AS C_6, C_66696c655f636f6e64756974627567626173685f626f6f6b7331356d696c.`Image-URL-M` AS C_43, C_66696c655f636f6e64756974627567626173685f626f6f6b7331356d696c.`Image-URL-L` AS C_0 FROM `file_conduitbugbash`.`books15mil` C_66696c655f636f6e64756974627567626173685f626f6f6b7331356d696c WHERE C_66696c655f636f6e64756974627567626173685f626f6f6b7331356d696c.`Year-Of-Publication` IN ('2003', '2004', '2005', '2000', '2001', '2002') ) C_4954424c GROUP BY C_2, C_1 LIMIT 1001")
]
all_query_perf = []
for query_id, query in queries:
query_to_execute = query.replace('`file_conduitbugbash`.`books15mil`', "books15mils")
start_time = time.time()
bc.sql(query_to_execute)
end_time = time.time()
memory_used, memory_free = retrieve_cudf_memory()
print(f"[GPU] {memory_used} [GPU] {memory_free} ")
print(query, ",", str(int((end_time - start_time) * 1000)) + "ms")
all_query_perf.append((query_id, str(int((end_time - start_time) * 1000)) + "ms"))
for entry in all_query_perf:
print(entry[0], ",", entry[1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment