Created
December 8, 2016 14:21
-
-
Save trolleway/9f2c4cb40d0984b0f31e5004df856c1b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
# Parse the placemarks of a KML file and export their data into CSV.
# will need: os.listdir(path) | |
#http://robertcarlsen.net/2010/02/23/parsing-foursquare-kml-files-1113 | |
import sys | |
import os | |
import codecs | |
import csv | |
from bs4 import BeautifulSoup | |
# Resolve the input path: the first command-line argument if given,
# otherwise the current working directory.
# NOTE: `dir` and `file` shadow Python builtins; the names are kept so that
# any code relying on these module-level variables keeps working.
if len(sys.argv) > 1:
    dir = sys.argv[1]
else:
    dir = os.getcwd()
file = dir

# create the output list
outputData = []

# Sanity check: only work on KML files (extension compared case-insensitively).
# The exit status stays -1 as before, but the reason is now reported instead
# of terminating silently.
if not file.lower().endswith('.kml'):
    sys.stderr.write("Not a .kml file: " + file + "\n")
    sys.stderr.write("Usage: " + sys.argv[0] + " <file.kml>\n")
    sys.exit(-1)

print("Reading file: " + file)

# Read the whole document as UTF-8 text; the context manager closes the
# handle even if reading fails.
with codecs.open(file, 'r', "utf-8") as fh:
    html = fh.read()
# Parse the KML text and collect one record (dict) per <Placemark> into
# outputData, which is later exported to CSV.
#
# The 'xml' parser is used deliberately: the HTML-oriented 'lxml' parser
# converts tag names to lower case, so a case-sensitive lookup such as
# findAll('LabelStyle') can never match. With the XML parser, tag names keep
# their KML spelling ('Placemark', 'Style', 'LabelStyle', ...).
soup = BeautifulSoup(html, 'xml')

# Sub-elements tried, in order, for the 'Time' column.
# NOTE(review): the original read the 4th child of the placemark by position;
# which element that was cannot be told from this file -- confirm the tag
# names against real input.
TIME_TAGS = ('when', 'published', 'updated')


def _ascii_text(node):
    """Return the stripped text of *node* with non-ASCII characters dropped.

    Returns '' when *node* is None. Non-ASCII characters are discarded, as
    the original code did with encode("ascii", "ignore"); the result is
    decoded back so it is a text string on both Python 2 and Python 3.
    """
    if node is None:
        return ''
    text = node.get_text().strip()
    return text.encode("ascii", "ignore").decode("ascii")


def _first_text(parent, tag_names):
    """Return the ASCII text of the first descendant of *parent* whose tag
    name is in *tag_names* (tried in order), or '' if none is present."""
    for tag_name in tag_names:
        node = parent.find(tag_name)
        if node is not None:
            return _ascii_text(node)
    return ''


def _lon_lat(coord_text):
    """Split coordinate text 'lon,lat[,alt]' into a (lon, lat) pair.

    Only the first whitespace-separated tuple is used; parts that are
    missing are returned as ''.
    """
    tuples = coord_text.split()
    if not tuples:
        return '', ''
    parts = tuples[0].split(',')
    lon = parts[0]
    lat = parts[1] if len(parts) > 1 else ''
    return lon, lat


# Records are appended below, so this must be a list (it was a dict before,
# which has no append()).
outputData = []

# Diagnostic lookup of <Style><LabelStyle>. It is the same element for every
# placemark (first <Style> of the document), so it is located once.
# TODO: the same approach is meant to extract values from
# <ExtendedData><Data name="CELLID"><value>4881</value>.
first_style = soup.find('Style')
label_style = None
if first_style is not None:
    label_style = first_style.find('LabelStyle')

# get the placemark data:
for placemark in soup.find_all('Placemark'):
    # Elements are looked up by tag name rather than by child position, so
    # whitespace nodes and optional elements do not shift the fields.
    imageData = {}
    imageData['Name'] = _first_text(placemark, ('name',))
    imageData['Description'] = _first_text(placemark, ('description',))
    imageData['Time'] = _first_text(placemark, TIME_TAGS)
    lon, lat = _lon_lat(_first_text(placemark, ('coordinates',)))
    imageData['Lon'] = lon
    imageData['Lat'] = lat

    print(imageData['Name'])
    if label_style is not None:
        print(label_style)
    print('--------------------')

    # add this placemark's data to the list
    outputData.append(imageData)
# Write the collected records to out.csv in the current working directory.
# The columns are the keys of the first record in reverse alphabetical
# order; keys that appear only in later records are ignored
# (extrasaction='ignore').
if not outputData:
    # Nothing was collected: report it instead of failing on outputData[0]
    # and leaving an empty file behind.
    print("No records found; no output file written")
else:
    fieldnames = sorted(outputData[0].keys(), reverse=True)
    with codecs.open(os.path.join(os.getcwd(), "out.csv"), 'w', "utf-8") as out:
        print("Writing output file: " + out.name)
        writer = csv.DictWriter(out, dialect='excel', fieldnames=fieldnames,
                                extrasaction='ignore',
                                quoting=csv.QUOTE_NONNUMERIC)
        writer.writeheader()
        for row in outputData:
            writer.writerow(row)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment