Skip to content

Instantly share code, notes, and snippets.

@bdargan
Last active April 11, 2017 11:30
Show Gist options
  • Save bdargan/4951012 to your computer and use it in GitHub Desktop.
load maxmind geocitylite db into elasticsearch
#! /usr/bin/env python
import json
import datetime
import httplib2
import re
import csv
import sys
# Shared HTTP client and JSON content-type header used by every
# Elasticsearch helper below; all requests go to localhost:9200.
http = httplib2.Http()
hdr_json = {'Content-type': 'application/json'}
def es_inject(doc_id, doc):
url = 'http://localhost:9200/%s' % doc_id
dthandler = lambda obj: obj.isoformat() if isinstance(obj, datetime.datetime) else None
response, content = http.request(url, 'PUT', headers=hdr_json, body=json.dumps(doc, default=dthandler))
print doc_id, response
def es_search(index, type, name, lat, lon):
    """POST a bool query matching name, lat and lon exactly-as-given
    against index/type on the local ES node; return the raw JSON
    response body (up to 500 hits)."""
    must_clauses = [
        {"query_string": {"default_field": "name", "query": name}},
        {"query_string": {"default_field": "lat", "query": lat}},
        {"query_string": {"default_field": "lon", "query": lon}},
    ]
    query = {
        "query": {"bool": {"must": must_clauses, "must_not": [], "should": []}},
        "from": 0,
        "size": 500,
        "sort": [],
        "facets": {},
    }
    base = 'http://localhost:9200/%s/%s' % (index, type)
    response, content = http.request(base + "/_search", 'POST',
                                     headers=hdr_json, body=json.dumps(query))
    return content
def es_delete(index,doctype,id=None):
if id:
url = "http://localhost:9200/%s/%s/%s" % (index, doctype,id)
else:
url = "http://localhost:9200/%s/%s" % (index, doctype)
response = http.request(url, "DELETE", headers=hdr_json)
print response
def unicode_csv_reader(utf8_data, delimiter=',',quotechar='"', **kwargs):
csv_reader = csv.reader(utf8_data, delimiter=',',quotechar='"', **kwargs)
for row in csv_reader:
try:
yield [unicode(cell, 'utf-8') for cell in row]
except UnicodeDecodeError:
decodes = [decode_heuristically(cell) for cell in row]
print decodes
yield [e[0] for e in decodes]
def decode_heuristically(string, enc = None, denc = sys.getdefaultencoding()):
    """
    Try to interpret 'string' using several possible encodings.
    @input : string, encode type.
    @output: a list [decoded_string, flag_decoded, encoding]
    flag_decoded is 0 for a clean round-trip decode, 1 for a forced
    ("ignore" errors) decode.
    """
    # Already-decoded input is returned as-is (reported as utf-8).
    if isinstance(string, unicode): return string, 0, "utf-8"
    try:
        # Pure ASCII needs no guessing.
        # NOTE(review): returns the original byte string here, not the
        # decoded unicode object — callers get str for ASCII input.
        new_string = unicode(string, "ascii")
        return string, 0, "ascii"
    except UnicodeError:
        # Candidate list, most preferred first; caller's hint (enc) and
        # the system default (denc) are tried before the generic set.
        encodings = ["utf-8","iso-8859-1","cp1252","iso-8859-15"]
        if denc != "ascii": encodings.insert(0, denc)
        if enc: encodings.insert(0, enc)
        for enc in encodings:
            # Bytes 0x80-0x9f are C1 controls in iso-8859-1/-15, so such
            # input is almost certainly not really in those encodings.
            if (enc in ("iso-8859-15", "iso-8859-1") and
                re.search(r"[\x80-\x9f]", string) is not None):
                continue
            # These code points differ between iso-8859-15 and the
            # iso-8859-1/cp1252 pair; their presence rules the latter out.
            if (enc in ("iso-8859-1", "cp1252") and
                re.search(r"[\xa4\xa6\xa8\xb4\xb8\xbc-\xbe]", string)\
                is not None):
                continue
            try:
                new_string = unicode(string, enc)
            except UnicodeError:
                pass
            else:
                # Accept only when the decode round-trips losslessly.
                if new_string.encode(enc) == string:
                    return new_string, 0, enc
        # If unable to decode,doing force decoding i.e.neglecting those chars.
        # Pick the candidate that preserves the most characters.
        output = [(unicode(string, enc, "ignore"), enc) for enc in encodings]
        output = [(len(new_string[0]), new_string) for new_string in output]
        output.sort()
        new_string, enc = output[-1][1]
        return new_string, 1, enc
def es_create_index_and_mapping(index,type):
    """Create *index* on the local ES node and register the 'loc'
    mapping under *type*.

    NOTE(review): responses are only printed, never checked — failures
    (e.g. index already exists) are silently ignored.
    """
    response = http.request("http://localhost:9200/%s" % index, "POST", headers=hdr_json)
    print response
    # Mapping: every field is a plain string except 'location', which is
    # a geo_point indexed with separate lat/lon sub-fields.
    response = http.request("http://localhost:9200/%s/%s/_mapping" %(index,type), "POST", headers=hdr_json, body="""{
"loc": {
"properties": {
"area_code": {
"type": "string"
},
"city": {
"type": "string"
},
"country_code": {
"type": "string"
},
"loc_id": {
"type": "string"
},
"location": {
"type": "geo_point",
"lat_lon": "true"
},
"metro_code": {
"type": "string"
},
"name": {
"type": "string"
},
"postal_code": {
"type": "string"
},
"region": {
"type": "string"
}
}
}
}
""")
    print response
if __name__ == "__main__":
# reader = unicode_csv_reader(open("PaxISbsp_airport_all.csv"), delimiter=',',quotechar='"')
# header = reader.next()
# es_delete("geocity","ports")
#es_create_index_and_mapping("geocity","loc")
# for row in reader:
# print row
# es_delete("geocity","loc",row[0])
# doc={"code":row[0],"name":row[1],"city":row[2],"state":row[3],"state_name":row[4], "location":{"lat":row[5],"lon":row[6]}, "country_code":row[7],"country_name":row[8],"global_region":row[9]}
# es_inject("%s/%s/%s" % ("geocity","ports",row[0]), doc)
reader = unicode_csv_reader(open("GeoLiteCity-Location.csv"), delimiter=',',quotechar='"')
header = reader.next()
header = reader.next()
index = "geocity2"
doctype = "loc"
es_delete(index,doctype)
# es_create_index_and_mapping("geocity","loc")
# locId,country,region,city,postalCode,latitude,longitude,metroCode,areaCode
for row in reader:
if len(row[3]) > 0:
resp = es_search(index,"loc", row[3], row[5],row[6])
#print json.loads(resp)['hits']['total']
if json.loads(resp)['hits']['total'] > 0:
# there is an entry for the location name and geo-coordinate already, skipping
print "skipping a duplicate name and geo-coordinate %s,%s,%s:" % (row[3], row[5], row[6])
else:
doc={"loc_id":row[0],"country_code":row[1],"region":row[2],"name":row[3],"city":row[3],"postal_code":row[4],"location":{"lat":row[5],"lon":row[6]}, "metro_code":row[7], "area_code":row[8]}
es_inject("%s/%s/%s" % (index,doctype,row[0]), doc)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment