Last active: November 12, 2019 21:30
Using the 100-million labeled image dataset Yahoo Flickr Creative Commons 100M (YFCC100M) ref: http://qiita.com/_akisato/items/66deb481ea3cedf388fa
% sudo pip install s3cmd
% s3cmd --configure

Enter new values or accept defaults in brackets with Enter.
Refer to user manual for detailed description of all options.

Access key and Secret key are your identifiers for Amazon S3. Leave them empty for using the env variables.
Access Key : [enter your AWS access key ID]
Secret Key : [enter your AWS secret access key]
Default Region : [something like "Japan" is fine]

Encryption password is used to protect your files from reading
by unauthorized persons while in transfer to S3
Encryption password : [enter a password of your choice and remember it; you probably won't need it]
Path to GPG program : [enter the absolute path to gpg; install it if you don't have it]

When using secure HTTPS protocol all communication with Amazon S3
servers is protected from 3rd party eavesdropping. This method is
slower than plain HTTP, and can only be proxied with Python 2.7 or newer
Use HTTPS protocol [Yes]: [just answer yes here]

On some networks all internet access must go through a HTTP proxy.
Try setting it here if you can't connect to S3 directly
HTTP Proxy server name: [if you are behind a proxy, enter its server name and port number]

New settings:
Access Key: ....................
Secret Key: ....................
Default Region: Japan
Encryption password: ...........
Path to GPG program: /usr/local/bin/gpg
Use HTTPS protocol: True
HTTP Proxy server name:
HTTP Proxy server port: 0

Test access with supplied credentials? [Y/n] [just press Enter]
Please wait, attempting to list all buckets...
Success. Your access key and secret key worked fine :-)

Now verifying that encryption works...
Success. Encryption and decryption worked fine :-)

Save settings? [y/N] [enter y; the settings will be saved]
Configuration saved to '/Users/akisato/.s3cfg'
% s3cmd ls -H s3://yahoo-webscope/I3set14/
2017-03-08 16:54   10k  s3://yahoo-webscope/I3set14/WebscopeReadMe.txt
2017-03-08 16:54    2G  s3://yahoo-webscope/I3set14/yfcc100m_autotags.bz2
2017-03-08 16:54   13G  s3://yahoo-webscope/I3set14/yfcc100m_dataset.bz2
2017-03-08 16:56    9G  s3://yahoo-webscope/I3set14/yfcc100m_exif.bz2
2017-03-08 16:57 1710M  s3://yahoo-webscope/I3set14/yfcc100m_places.bz2
% s3cmd get --recursive s3://yahoo-webscope/I3set14/
% s3cmd get --recursive s3://yahoo-webscope/I3set14/ --continue
% find . -name "*.bz2" | xargs -L1 -n1 -I{} bzip2 -vd {}
% mv yfcc100m_autotags yfcc100m_autotags.csv
% mv yfcc100m_dataset yfcc100m_dataset.csv
% mv yfcc100m_exif yfcc100m_exif.csv
% mv yfcc100m_places yfcc100m_places.csv
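If disk space is tight, full decompression can also be avoided: the .bz2 archives can be streamed line by line with Python's standard bz2 module. A minimal sketch (the filename matches the listing above; `head_bz2` is a hypothetical helper, not part of the dataset tooling):

```python
import bz2

def head_bz2(path, n=3):
    """Yield the first n lines of a bzip2-compressed text file
    without decompressing it to disk first."""
    with bz2.open(path, "rt", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if i >= n:
                break
            yield line.rstrip("\n")

# Usage (assumes the 13 GB archive from the listing above is present):
# for line in head_bz2("yfcc100m_dataset.bz2"):
#     print(line)
```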
% less yfcc100m_dataset.csv
0 6985418911 4e2f7a26a1dfbf165a7e30bdabf7e72a 39089491@N00 nino63004 2012-02-16 09:56:37.0 1331840483 Canon+PowerShot+ELPH+310+HS
IMG_0520 canon,canon+powershot+hs+310,carnival+escatay,cruise,elph,hs+310,key+west+florida,powershot -81.804885 24.550558
12 http://www.flickr.com/photos/39089491@N00/6985418911/ http://farm8.staticflickr.com/7205/6985418911_df7747990d.jpg Attribution-NonCommercial-NoDerivs License http://creativecommons.org/licenses/by-nc-nd/2.0/ 7205
8 df7747990d 692d7e0a7f jpg 0
...
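Each record of yfcc100m_dataset.csv, like the sample row above, is a single line of 25 tab-separated columns. A minimal parsing sketch (the field names and the `parse_row` helper are illustrative, chosen to match the column layout used by the scripts in this gist; the example row is shortened with placeholder values):

```python
# Sketch: pull a few named fields out of one yfcc100m_dataset.csv row.
# Real rows carry 25 tab-separated columns; only four are named here.
def parse_row(line):
    f = line.rstrip("\n").split("\t")
    return {
        "line_num": int(f[0]),   # running line number
        "photo_id": int(f[1]),   # Flickr photo ID
        "url_get":  f[16],       # direct download URL
        "ext":      f[23],       # file extension (e.g. jpg)
    }

# Hypothetical example: a 25-column row padded with placeholder values.
cols = ["0", "6985418911"] + ["x"] * 23
cols[16] = "http://farm8.staticflickr.com/7205/6985418911_df7747990d.jpg"
cols[23] = "jpg"
row = parse_row("\t".join(cols))
print(row["photo_id"])  # 6985418911
```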
% python yfcc_download.py
import json
import os
import os.path

def split_str(s, n):
    """Split string s into chunks of length n."""
    return [s[i:i+n] for i in range(0, len(s), n)]

def extract_metadata(elems):
    """Parse a 'key:value,key:value,...' field into a dict."""
    if len(elems) < 2:
        return None
    d = dict()
    for elem in elems[1].split(','):
        pair = elem.split(':', 1)
        if len(pair) == 2:  # skip malformed entries without a colon
            d[pair[0]] = pair[1]
    return d

def extract_metadata_d(elems):
    """Map the 25 tab-separated columns of yfcc100m_dataset.csv to named fields."""
    if len(elems) < 25:
        return None
    d = dict()
    d['photo_hash'] = elems[2]
    d['user_id'] = elems[3]
    d['user_nickname'] = elems[4]
    d['date_taken'] = elems[5]
    d['date_uploaded'] = elems[6]
    d['capture_device'] = elems[7]
    d['title'] = elems[8]
    d['description'] = elems[9]
    d['user_tags'] = elems[10]
    d['machine_tags'] = elems[11]
    d['longitude'] = elems[12]
    d['latitude'] = elems[13]
    d['pos_accuracy'] = elems[14]
    d['url_show'] = elems[15]
    d['url_get'] = elems[16]
    d['license_name'] = elems[17]
    d['license_url'] = elems[18]
    d['server_id'] = elems[19]
    d['farm_id'] = elems[20]
    d['photo_secret'] = elems[21]
    d['photo_secret_orig'] = elems[22]
    d['photo_ext'] = elems[23]
    d['photo_or_video'] = elems[24]
    return d

fin_autotag = open('yfcc100m_autotags.csv')
fin_exif = open('yfcc100m_exif.csv')
fin_places = open('yfcc100m_places.csv')
fin_dataset = open('yfcc100m_dataset.csv')
metadir = './meta'

while True:
    # read one line from each metadata file
    line_a = fin_autotag.readline()
    line_e = fin_exif.readline()
    line_p = fin_places.readline()
    line_d = fin_dataset.readline()
    if (not line_a) or (not line_e) or (not line_p) or (not line_d):
        break
    line_a_split = line_a.strip().split('\t')
    line_e_split = line_e.strip().split('\t')
    line_p_split = line_p.strip().split('\t')
    line_d_split = line_d.strip().split('\t')
    # check that all four lines refer to the same photo ID
    photo_id_a = int(line_a_split[0])
    photo_id_e = int(line_e_split[0])
    photo_id_p = int(line_p_split[0])
    photo_id_d = int(line_d_split[1])
    if photo_id_a != photo_id_e or photo_id_e != photo_id_p or photo_id_p != photo_id_d:
        print('Photo ID mismatched.')
        continue
    photo_id = photo_id_a
    # skip photos whose metadata file already exists
    split_photo_id = split_str(str(photo_id), 3)
    json_dir = os.path.join(metadir, split_photo_id[0], split_photo_id[1])
    json_path = os.path.join(json_dir, str(photo_id) + '_meta.json')
    if os.path.isfile(json_path) and os.path.getsize(json_path):
        print('Photo ID %d metadata already exists, skip.' % photo_id)
        continue
    print('Photo ID %d metadata creating...' % photo_id)
    os.makedirs(json_dir, exist_ok=True)
    # extract metadata
    autotags = extract_metadata(line_a_split)
    exif = extract_metadata(line_e_split)
    places = extract_metadata(line_p_split)
    othermeta = extract_metadata_d(line_d_split)
    # form JSON data and write it to a file
    json_data = dict()
    if autotags: json_data['autotags'] = autotags
    if exif: json_data['EXIF'] = exif
    if places: json_data['places'] = places
    if othermeta: json_data['othermeta'] = othermeta
    with open(json_path, 'w') as fout:
        json.dump(json_data, fout, sort_keys=True, indent=4)
import os
import os.path
import urllib.request

def split_str(s, n):
    """Split string s into chunks of length n."""
    return [s[i:i+n] for i in range(0, len(s), n)]

def img_download(url, filename):
    """Download url and save it to filename."""
    with urllib.request.urlopen(url) as img, open(filename, 'wb') as fout:
        fout.write(img.read())

fin = open('./yfcc100m_dataset.csv')
imgdir = './img'
print('Start downloading YFCC100M dataset...')
for line in fin:
    line_split = line.strip().split('\t')
    line_num = int(line_split[0])
    photo_id = int(line_split[1])    # photo ID
    photo_url = line_split[16]       # photo URL for downloading
    photo_ext = os.path.splitext(photo_url)[1]
    if photo_ext == '':
        photo_ext = '.mp4'           # video URLs carry no extension
    split_photo_id = split_str(str(photo_id), 3)
    photo_dir = os.path.join(imgdir, split_photo_id[0], split_photo_id[1])
    photo_name = os.path.join(photo_dir, str(photo_id) + photo_ext)
    if os.path.isfile(photo_name) and os.path.getsize(photo_name):
        print('Line %d, id %d, skipped' % (line_num, photo_id))
        continue  # avoid duplicate downloading
    print('Line %d, id %d, download' % (line_num, photo_id))
    try:
        os.makedirs(photo_dir, exist_ok=True)
        img_download(photo_url, photo_name)
    except Exception:
        print('Failed')