Last active
April 11, 2017 11:30
-
-
Save bdargan/4951012 to your computer and use it in GitHub Desktop.
Load the MaxMind GeoLite City database into Elasticsearch.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
import json | |
import datetime | |
import httplib2 | |
import re | |
import csv | |
import sys | |
# Shared httplib2 HTTP client and the JSON content-type header used by
# every Elasticsearch request in this script (ES node assumed at
# localhost:9200 -- see the URL literals below).
http = httplib2.Http()
hdr_json = {'Content-type': 'application/json'}
def es_inject(doc_id, doc): | |
url = 'http://localhost:9200/%s' % doc_id | |
dthandler = lambda obj: obj.isoformat() if isinstance(obj, datetime.datetime) else None | |
response, content = http.request(url, 'PUT', headers=hdr_json, body=json.dumps(doc, default=dthandler)) | |
print doc_id, response | |
def es_search(index, type, name, lat, lon):
    """Search ES for documents matching name, lat and lon simultaneously.

    Returns the raw JSON response body (a string) from the _search endpoint;
    the caller is expected to json.loads() it.
    """
    must_clauses = [
        {"query_string": {"default_field": "name", "query": name}},
        {"query_string": {"default_field": "lat", "query": lat}},
        {"query_string": {"default_field": "lon", "query": lon}},
    ]
    query = {
        "query": {"bool": {"must": must_clauses, "must_not": [], "should": []}},
        "from": 0,
        "size": 500,
        "sort": [],
        "facets": {},
    }
    endpoint = 'http://localhost:9200/%s/%s' % (index, type) + "/_search"
    response, content = http.request(endpoint, 'POST', headers=hdr_json, body=json.dumps(query))
    return content
def es_delete(index,doctype,id=None): | |
if id: | |
url = "http://localhost:9200/%s/%s/%s" % (index, doctype,id) | |
else: | |
url = "http://localhost:9200/%s/%s" % (index, doctype) | |
response = http.request(url, "DELETE", headers=hdr_json) | |
print response | |
def unicode_csv_reader(utf8_data, delimiter=',',quotechar='"', **kwargs): | |
csv_reader = csv.reader(utf8_data, delimiter=',',quotechar='"', **kwargs) | |
for row in csv_reader: | |
try: | |
yield [unicode(cell, 'utf-8') for cell in row] | |
except UnicodeDecodeError: | |
decodes = [decode_heuristically(cell) for cell in row] | |
print decodes | |
yield [e[0] for e in decodes] | |
def decode_heuristically(string, enc=None, denc=sys.getdefaultencoding()):
    """
    Try to interpret 'string' using several possible encodings.

    @input : string (a byte string), optional preferred encoding, default
             encoding to try first.
    @output: a tuple (decoded_string, flag_forced, encoding); flag_forced
             is 1 when undecodable characters had to be dropped.

    Fixes vs. the original:
    * the ASCII fast path now returns the *decoded* text instead of the
      raw byte string, matching the documented contract;
    * regexes use bytes patterns (br"...") and bytes.decode() is used
      instead of the unicode() builtin -- identical behaviour on Python 2,
      but the function now also runs under Python 3.
    """
    # Already-decoded text: nothing to do.
    if not isinstance(string, bytes):
        return string, 0, "utf-8"
    try:
        return string.decode("ascii"), 0, "ascii"
    except UnicodeError:
        encodings = ["utf-8", "iso-8859-1", "cp1252", "iso-8859-15"]
        if denc != "ascii":
            encodings.insert(0, denc)
        if enc:
            encodings.insert(0, enc)
        for enc in encodings:
            # iso-8859-1/-15 assign control characters to 0x80-0x9f; bytes
            # in that range suggest cp1252 instead, so skip these codecs.
            if (enc in ("iso-8859-15", "iso-8859-1") and
                    re.search(br"[\x80-\x9f]", string) is not None):
                continue
            # These code points are the ones iso-8859-15 redefined; their
            # presence rules out iso-8859-1/cp1252 here.
            if (enc in ("iso-8859-1", "cp1252") and
                    re.search(br"[\xa4\xa6\xa8\xb4\xb8\xbc-\xbe]", string) is not None):
                continue
            try:
                new_string = string.decode(enc)
            except (UnicodeError, LookupError):
                pass
            else:
                # Accept only lossless round-trips.
                if new_string.encode(enc) == string:
                    return new_string, 0, enc
        # Unable to decode cleanly: force-decode, dropping bad characters,
        # and keep the candidate that preserved the most characters.
        output = [(string.decode(e, "ignore"), e) for e in encodings]
        output = [(len(cand[0]), cand) for cand in output]
        output.sort()
        new_string, enc = output[-1][1]
        return new_string, 1, enc
def es_create_index_and_mapping(index,type): | |
response = http.request("http://localhost:9200/%s" % index, "POST", headers=hdr_json) | |
print response | |
response = http.request("http://localhost:9200/%s/%s/_mapping" %(index,type), "POST", headers=hdr_json, body="""{ | |
"loc": { | |
"properties": { | |
"area_code": { | |
"type": "string" | |
}, | |
"city": { | |
"type": "string" | |
}, | |
"country_code": { | |
"type": "string" | |
}, | |
"loc_id": { | |
"type": "string" | |
}, | |
"location": { | |
"type": "geo_point", | |
"lat_lon": "true" | |
}, | |
"metro_code": { | |
"type": "string" | |
}, | |
"name": { | |
"type": "string" | |
}, | |
"postal_code": { | |
"type": "string" | |
}, | |
"region": { | |
"type": "string" | |
} | |
} | |
} | |
} | |
""") | |
print response | |
if __name__ == "__main__":
    # --- Earlier one-off load of an airports CSV, kept for reference ---
    # reader = unicode_csv_reader(open("PaxISbsp_airport_all.csv"), delimiter=',',quotechar='"')
    # header = reader.next()
    # es_delete("geocity","ports")
    #es_create_index_and_mapping("geocity","loc")
    # for row in reader:
    # print row
    # es_delete("geocity","loc",row[0])
    # doc={"code":row[0],"name":row[1],"city":row[2],"state":row[3],"state_name":row[4], "location":{"lat":row[5],"lon":row[6]}, "country_code":row[7],"country_name":row[8],"global_region":row[9]}
    # es_inject("%s/%s/%s" % ("geocity","ports",row[0]), doc)

    # Load the MaxMind GeoLiteCity locations CSV into Elasticsearch.
    reader = unicode_csv_reader(open("GeoLiteCity-Location.csv"), delimiter=',',quotechar='"')
    # Two leading rows are skipped -- presumably the MaxMind copyright line
    # followed by the column header; confirm against the actual file.
    header = reader.next()
    header = reader.next()
    index = "geocity2"
    doctype = "loc"
    # Drop any existing docs of this type before re-loading.
    es_delete(index,doctype)
    # es_create_index_and_mapping("geocity","loc")
    # Column layout of the CSV:
    # locId,country,region,city,postalCode,latitude,longitude,metroCode,areaCode
    for row in reader:
        # Only rows with a non-empty city name (row[3]) are indexed.
        if len(row[3]) > 0:
            # Skip rows whose name + coordinates already exist in the index.
            resp = es_search(index,"loc", row[3], row[5],row[6])
            #print json.loads(resp)['hits']['total']
            if json.loads(resp)['hits']['total'] > 0:
                # there is an entry for the location name and geo-coordinate already, skipping
                print "skipping a duplicate name and geo-coordinate %s,%s,%s:" % (row[3], row[5], row[6])
            else:
                # Note: "name" and "city" are both set from row[3] (city).
                doc={"loc_id":row[0],"country_code":row[1],"region":row[2],"name":row[3],"city":row[3],"postal_code":row[4],"location":{"lat":row[5],"lon":row[6]}, "metro_code":row[7], "area_code":row[8]}
                es_inject("%s/%s/%s" % (index,doctype,row[0]), doc)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment