Created
September 1, 2017 13:59
-
-
Save narulkargunjan/ab8d3b4905cb131e7613cd790b5e298d to your computer and use it in GitHub Desktop.
HappyBase sample for accessing HBase using Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import happybase | |
import time | |
# --- HBase target -----------------------------------------------------------
host = "0.0.0.0"
namespace = "sample_data"   # passed to happybase as the table prefix
table_name = "rfic"
batch_size = 1000           # handed to table.batch() as its batch_size

# --- Input ------------------------------------------------------------------
file_path = "Request_for_Information_Cases.csv"

# --- Run bookkeeping --------------------------------------------------------
row_count = 0               # incremented once per CSV row read by the script
start_time = time.time()    # reference point for the reported duration
def connect_to_hbase():
    """Open an HBase connection and prepare a write batch.

    The module-level ``host``, ``namespace``, ``table_name`` and
    ``batch_size`` settings are used; ``namespace`` is supplied as the table
    prefix with ``":"`` as the prefix separator.

    Returns a ``(connection, batch)`` tuple. The caller owns the connection
    and must close it.
    """
    connection = happybase.Connection(
        host=host,
        table_prefix=namespace,
        table_prefix_separator=":",
    )
    connection.open()
    write_batch = connection.table(table_name).batch(batch_size=batch_size)
    return connection, write_batch
def insert_row(batch, row):
    """Queue one CSV record on the HBase batch.

    The record is handed to ``batch.put``; the batch decides when queued
    rows are actually sent to the database.

    ``row`` is a sequence laid out as:

        [ id, keyword, subcategory, type, township, city, zip,
          council_district, opened, closed, status, origin, location ]

    Element 0 becomes the row key; elements 1-12 are stored under the
    ``data`` column family. A row with fewer than 13 elements raises
    ``IndexError``.
    """
    # Column names in the same order as row[1] .. row[12].
    columns = (
        "data:kw", "data:sub", "data:type",
        "data:town", "data:city", "data:zip",
        "data:cdist", "data:open", "data:close",
        "data:status", "data:origin", "data:loc",
    )
    row_key = row[0]
    cells = {}
    for position, column in enumerate(columns, 1):
        cells[column] = row[position]
    batch.put(row_key, cells)
def read_csv():
    """Open the CSV at the module-level ``file_path`` for reading.

    Returns a ``(reader, file_handle)`` tuple. The handle is returned still
    open so the caller can close it once the reader has been consumed.
    """
    handle = open(file_path, "r")
    return csv.reader(handle), handle
# After everything has been defined, run the script.
#
# Flow: connect to HBase, stream the CSV into the write batch, flush whatever
# is still queued, then release both resources and report timing.
conn, batch = connect_to_hbase()
# Single-argument print(...) produces identical output on Python 2 and 3.
print("Connect to HBase. table name: %s, batch size: %i"
      % (table_name, batch_size))

csvfile = None
try:
    # Open the CSV inside the try block: if the file is missing or unreadable
    # the finally clause below still closes the HBase connection.
    csvreader, csvfile = read_csv()
    print("Connected to file. name: %s" % (file_path))

    # Loop through the rows. row_count counts every row read from the file,
    # including the header row and any blank lines.
    for row in csvreader:
        row_count += 1
        if row_count == 1:
            # The first row contains column headers, so skip it.
            continue
        if not row:
            # csv.reader yields [] for a blank line; there is no row key to
            # insert, so skip it instead of failing with IndexError.
            continue
        insert_row(batch, row)

    # If there are any leftover rows in the batch, send them now.
    batch.send()
finally:
    # No matter what happens, close the file handle (if it was opened) and
    # the HBase connection.
    if csvfile is not None:
        csvfile.close()
    conn.close()

duration = time.time() - start_time
print("Done. row count: %i, duration: %.3f s" % (row_count, duration))
# Source: https://gist.github.com/jarrettmeyer/26b3e1fcd423071a7a6d
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment