Last active
April 11, 2017 11:30
-
-
Save bdargan/4951012 to your computer and use it in GitHub Desktop.
Load the MaxMind GeoLite City database into Elasticsearch.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
import json | |
import datetime | |
import httplib2 | |
import re | |
import csv | |
import sys | |
# Shared httplib2 HTTP client and the JSON content-type header used by
# every Elasticsearch request in this script (ES node assumed at
# localhost:9200 -- see the URL literals below).
http = httplib2.Http()
hdr_json = {'Content-type': 'application/json'}
def es_inject(doc_id, doc): | |
url = 'http://localhost:9200/%s' % doc_id | |
dthandler = lambda obj: obj.isoformat() if isinstance(obj, datetime.datetime) else None | |
response, content = http.request(url, 'PUT', headers=hdr_json, body=json.dumps(doc, default=dthandler)) | |
print doc_id, response | |
def es_search(index, type, name, lat, lon):
    """Search ES for documents matching name, lat and lon simultaneously.

    Returns the raw JSON response body (a string) from the _search endpoint;
    the caller is expected to json.loads() it.
    """
    must_clauses = [
        {"query_string": {"default_field": "name", "query": name}},
        {"query_string": {"default_field": "lat", "query": lat}},
        {"query_string": {"default_field": "lon", "query": lon}},
    ]
    query = {
        "query": {"bool": {"must": must_clauses, "must_not": [], "should": []}},
        "from": 0,
        "size": 500,
        "sort": [],
        "facets": {},
    }
    endpoint = 'http://localhost:9200/%s/%s' % (index, type) + "/_search"
    response, content = http.request(endpoint, 'POST', headers=hdr_json, body=json.dumps(query))
    return content
def es_delete(index,doctype,id=None): | |
if id: | |
url = "http://localhost:9200/%s/%s/%s" % (index, doctype,id) | |
else: | |
url = "http://localhost:9200/%s/%s" % (index, doctype) | |
response = http.request(url, "DELETE", headers=hdr_json) | |
print response | |
def unicode_csv_reader(utf8_data, delimiter=',',quotechar='"', **kwargs): | |
csv_reader = csv.reader(utf8_data, delimiter=',',quotechar='"', **kwargs) | |
for row in csv_reader: | |
try: | |
yield [unicode(cell, 'utf-8') for cell in row] | |
except UnicodeDecodeError: | |
decodes = [decode_heuristically(cell) for cell in row] | |
print decodes | |
yield [e[0] for e in decodes] | |
def decode_heuristically(string, enc=None, denc=sys.getdefaultencoding()):
    """
    Try to interpret 'string' using several possible encodings.

    @input : string (a byte string), optional preferred encoding, default
             encoding to try first.
    @output: a tuple (decoded_string, flag_forced, encoding); flag_forced
             is 1 when undecodable characters had to be dropped.

    Fixes vs. the original:
    * the ASCII fast path now returns the *decoded* text instead of the
      raw byte string, matching the documented contract;
    * regexes use bytes patterns (br"...") and bytes.decode() is used
      instead of the unicode() builtin -- identical behaviour on Python 2,
      but the function now also runs under Python 3.
    """
    # Already-decoded text: nothing to do.
    if not isinstance(string, bytes):
        return string, 0, "utf-8"
    try:
        return string.decode("ascii"), 0, "ascii"
    except UnicodeError:
        encodings = ["utf-8", "iso-8859-1", "cp1252", "iso-8859-15"]
        if denc != "ascii":
            encodings.insert(0, denc)
        if enc:
            encodings.insert(0, enc)
        for enc in encodings:
            # iso-8859-1/-15 assign control characters to 0x80-0x9f; bytes
            # in that range suggest cp1252 instead, so skip these codecs.
            if (enc in ("iso-8859-15", "iso-8859-1") and
                    re.search(br"[\x80-\x9f]", string) is not None):
                continue
            # These code points are the ones iso-8859-15 redefined; their
            # presence rules out iso-8859-1/cp1252 here.
            if (enc in ("iso-8859-1", "cp1252") and
                    re.search(br"[\xa4\xa6\xa8\xb4\xb8\xbc-\xbe]", string) is not None):
                continue
            try:
                new_string = string.decode(enc)
            except (UnicodeError, LookupError):
                pass
            else:
                # Accept only lossless round-trips.
                if new_string.encode(enc) == string:
                    return new_string, 0, enc
        # Unable to decode cleanly: force-decode, dropping bad characters,
        # and keep the candidate that preserved the most characters.
        output = [(string.decode(e, "ignore"), e) for e in encodings]
        output = [(len(cand[0]), cand) for cand in output]
        output.sort()
        new_string, enc = output[-1][1]
        return new_string, 1, enc
def es_create_index_and_mapping(index,type): | |
response = http.request("http://localhost:9200/%s" % index, "POST", headers=hdr_json) | |
print response | |
response = http.request("http://localhost:9200/%s/%s/_mapping" %(index,type), "POST", headers=hdr_json, body="""{ | |
"loc": { | |
"properties": { | |
"area_code": { | |
"type": "string" | |
}, | |
"city": { | |
"type": "string" | |
}, | |
"country_code": { | |
"type": "string" | |
}, | |
"loc_id": { | |
"type": "string" | |
}, | |
"location": { | |
"type": "geo_point", | |
"lat_lon": "true" | |
}, | |
"metro_code": { | |
"type": "string" | |
}, | |
"name": { | |
"type": "string" | |
}, | |
"postal_code": { | |
"type": "string" | |
}, | |
"region": { | |
"type": "string" | |
} | |
} | |
} | |
} | |
""") | |
print response | |
if __name__ == "__main__":
    # --- Earlier one-off load of an airports CSV, kept for reference ---
    # reader = unicode_csv_reader(open("PaxISbsp_airport_all.csv"), delimiter=',',quotechar='"')
    # header = reader.next()
    # es_delete("geocity","ports")
    #es_create_index_and_mapping("geocity","loc")
    # for row in reader:
    # print row
    # es_delete("geocity","loc",row[0])
    # doc={"code":row[0],"name":row[1],"city":row[2],"state":row[3],"state_name":row[4], "location":{"lat":row[5],"lon":row[6]}, "country_code":row[7],"country_name":row[8],"global_region":row[9]}
    # es_inject("%s/%s/%s" % ("geocity","ports",row[0]), doc)

    # Load the MaxMind GeoLiteCity locations CSV into Elasticsearch.
    reader = unicode_csv_reader(open("GeoLiteCity-Location.csv"), delimiter=',',quotechar='"')
    # Two leading rows are skipped -- presumably the MaxMind copyright line
    # followed by the column header; confirm against the actual file.
    header = reader.next()
    header = reader.next()
    index = "geocity2"
    doctype = "loc"
    # Drop any existing docs of this type before re-loading.
    es_delete(index,doctype)
    # es_create_index_and_mapping("geocity","loc")
    # Column layout of the CSV:
    # locId,country,region,city,postalCode,latitude,longitude,metroCode,areaCode
    for row in reader:
        # Only rows with a non-empty city name (row[3]) are indexed.
        if len(row[3]) > 0:
            # Skip rows whose name + coordinates already exist in the index.
            resp = es_search(index,"loc", row[3], row[5],row[6])
            #print json.loads(resp)['hits']['total']
            if json.loads(resp)['hits']['total'] > 0:
                # there is an entry for the location name and geo-coordinate already, skipping
                print "skipping a duplicate name and geo-coordinate %s,%s,%s:" % (row[3], row[5], row[6])
            else:
                # Note: "name" and "city" are both set from row[3] (city).
                doc={"loc_id":row[0],"country_code":row[1],"region":row[2],"name":row[3],"city":row[3],"postal_code":row[4],"location":{"lat":row[5],"lon":row[6]}, "metro_code":row[7], "area_code":row[8]}
                es_inject("%s/%s/%s" % (index,doctype,row[0]), doc)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment