Skip to content

Instantly share code, notes, and snippets.

@trolleway
Created December 8, 2016 14:21
Show Gist options
  • Save trolleway/9f2c4cb40d0984b0f31e5004df856c1b to your computer and use it in GitHub Desktop.
Save trolleway/9f2c4cb40d0984b0f31e5004df856c1b to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Parse the placemarks of a single KML file and export their data into CSV.
# Processing a whole directory will need: os.listdir(path)
#http://robertcarlsen.net/2010/02/23/parsing-foursquare-kml-files-1113
import sys
import os
import codecs
import csv
from bs4 import BeautifulSoup
# Name of the CSV file written into the current working directory.
OUTPUT_FILENAME = "out.csv"


def to_ascii(text):
    """Return *text* with every non-ASCII character dropped.

    The export is kept ASCII-only, like the original
    ``encode("ascii", "ignore")`` calls. Decoding again yields text instead
    of a byte string, so the ``csv`` module writes the value itself.
    """
    return text.encode("ascii", "ignore").decode("ascii")


def child_text(node, tag_name):
    """Return the ASCII text of the first *tag_name* element below *node*.

    An empty string is returned when no such element exists, so a placemark
    that lacks a field still produces a complete row.
    """
    found = node.find(tag_name)
    if found is None:
        return ""
    return to_ascii(found.get_text())


def split_coordinates(text):
    """Split a ``lon,lat[,alt]`` string into a ``(lon, lat)`` pair of strings.

    Surrounding whitespace is stripped from both parts. ``("", "")`` is
    returned when *text* does not hold at least two comma-separated values.
    """
    parts = [part.strip() for part in text.split(",")]
    if len(parts) < 2:
        return "", ""
    return parts[0], parts[1]


def parse_placemark(placemark):
    """Build the CSV row (a dict of column name -> text) for one Placemark.

    Fields are looked up by tag name instead of by child position, so
    whitespace or additional child elements do not shift them.
    """
    lon, lat = split_coordinates(child_text(placemark, "coordinates"))
    row = {
        "Name": child_text(placemark, "name"),
        "Description": child_text(placemark, "description"),
        # NOTE(review): the original took the time from the fourth child by
        # position; <TimeStamp><when> is where standard KML keeps it --
        # confirm against the real input files.
        "Time": child_text(placemark, "when"),
        "Lon": lon,
        "Lat": lat,
    }
    # <ExtendedData><Data name="CELLID"><value>4881</value></Data> becomes a
    # column named after the "name" attribute. setdefault keeps the standard
    # columns above from being overwritten by a Data entry of the same name.
    for data in placemark.find_all("Data"):
        key = to_ascii(data.get("name", ""))
        if key:
            row.setdefault(key, child_text(data, "value"))
    return row


def write_csv(rows, path):
    """Write *rows* (a list of dicts) to *path* as an Excel-dialect CSV file.

    The header is the union of the keys of all rows in reverse alphabetical
    order; a row missing a column gets an empty cell. With no rows an empty
    file is created.
    """
    fieldnames = sorted({key for row in rows for key in row}, reverse=True)
    # codecs.open() opens the underlying file in binary mode, so the "\r\n"
    # line terminator of the csv module is written unchanged.
    with codecs.open(path, "w", "utf-8") as out:
        if not fieldnames:
            return
        writer = csv.DictWriter(out, dialect='excel', fieldnames=fieldnames,
                                extrasaction='ignore',
                                quoting=csv.QUOTE_NONNUMERIC)
        writer.writeheader()
        for row in rows:
            writer.writerow(row)


def main():
    """Read the KML file named on the command line and export it to out.csv.

    Exits with status -1 when the path (the current directory if no argument
    was given) does not end in ``.kml``.
    """
    # get the input path:
    kml_path = sys.argv[1] if len(sys.argv) > 1 else os.getcwd()
    # sanity checking, only work on kml files
    if not kml_path.endswith('.kml'):
        sys.stderr.write("Expected a path to a .kml file, got: %s\n" % kml_path)
        sys.exit(-1)
    print("Reading file: " + kml_path)
    with codecs.open(kml_path, 'r', "utf-8") as handle:
        markup = handle.read()
    # KML is XML, so it has to be parsed with the XML parser. The HTML parser
    # ('lxml') lower-cases every tag name, which is why looking up
    # 'LabelStyle' never returned anything.
    soup = BeautifulSoup(markup, 'xml')

    # The outputData list of per-placemark rows is collected here and then
    # exported to CSV.
    outputData = []
    for placemark in soup.find_all('Placemark'):
        row = parse_placemark(placemark)
        print(row['Name'])
        # Show the placemark's own <Style><LabelStyle>, when it has one.
        label_style = placemark.find('LabelStyle')
        if label_style is not None:
            print(label_style)
        print('--------------------')
        outputData.append(row)

    if not outputData:
        print("No placemarks found, nothing to write")
        return
    # create the output file
    out_path = os.path.join(os.getcwd(), OUTPUT_FILENAME)
    print("Writing output file: " + out_path)
    write_csv(outputData, out_path)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment