paulll · March 14, 2020 19:55
diff --git a/ytmhist2csv.py b/ytmhist2csv.py
 import json, re, sys

 # Turn the 'YouTube Music' entries of watch-history.json into one
 # "artist <TAB> title" line per track on stdout; a summary of how each track
 # was classified is written to stderr.

 # Channel name -> artist, for channels whose video titles are the bare track
 # title (no "Artist - Title" separator).
 known_music_channels = {
 	'Reol Official': 'Reol'
 }


 def guess_artist_title(fullname):
 	"""Guess (artist, title) from a video title without an ' - ' separator.

 	The heuristics run in order and a later match overrides an earlier one.
 	Either element is the string 'unknown' when no heuristic produced it.
 	"""
 	artist = 'unknown'
 	title = 'unknown'

 	# 1. Asian-style brackets
 	match_quotes_a = re.search('「.*?」', fullname)
 	match_quotes_b = re.search('【.*?】', fullname)
 	match_quotes_c = re.search('『.*?』', fullname)

 	if match_quotes_a:
 		title = match_quotes_a[0][1:-1]
 	if match_quotes_b:
 		artist = match_quotes_b[0][1:-1]
 	if match_quotes_c:
 		# 『artist』 followed by the title as the rest of the string
 		artist = match_quotes_c[0][1:-1]
 		title = fullname[match_quotes_c.end():].strip()

 	# 2. artist: title style
 	n_parts = re.split(r':\s+', fullname)
 	if len(n_parts) == 2:
 		artist = n_parts[0]
 		title = n_parts[1]

 	# 3. title by artist style
 	n_parts = re.split(r'\s+by\s+', fullname)
 	if len(n_parts) == 2:
 		artist = n_parts[1]
 		title = n_parts[0]

 	return artist, title


 def parse_track(track, counters):
 	"""Return (artist, title) for one history entry, or None to skip it.

 	track is one decoded entry of watch-history.json. counters is a dict that
 	is updated in place with the category the entry fell into. Entries that
 	are bare URLs or removed videos are counted and return None.

 	If the entry's title does not start with 'Watched ', the entry is printed
 	to stderr and the program exits with status 1 (SystemExit).
 	"""
 	if not track['title'].startswith('Watched '):
 		# Diagnostics go to stderr so stdout stays pure track data.
 		print(track, file=sys.stderr)
 		sys.exit(1)

 	fullname = track['title'][8:]  # drop the 'Watched ' prefix

 	# A missing or empty 'subtitles' list means there is no channel name.
 	subtitles = track.get('subtitles')
 	channel = subtitles[0].get('name') if subtitles else None

 	# "Artist - Title", separated by a hyphen, en dash or em dash.
 	parts = re.split(r'\s+[–—-]\s+', fullname)
 	artist = 'unknown'
 	title = 'unknown'

 	if len(parts) >= 2:
 		# NOTE: only the first two pieces are used; text after a second
 		# separator is dropped.
 		artist = parts[0]
 		title = parts[1]
 		counters['Guessed'] += 1
 	elif channel is not None and channel.endswith(' - Topic'):
 		# Native YTM/GPM artist: strip the 8-character ' - Topic' suffix
 		artist = channel[:-8]
 		title = fullname
 		counters['Native (GPM/YTM)'] += 1
 	elif 'watch?' in track['title']:
 		# no metadata at all, the title is just the video URL
 		counters['Without metadata'] += 1
 		return None
 	elif 'a video that has been removed' in track['title']:
 		counters['Removed'] += 1
 		return None
 	elif channel in known_music_channels:
 		# known authors on youtube
 		title = fullname
 		artist = known_music_channels[channel]
 		counters['Known'] += 1
 	else:
 		# 3rd-party upload without ' - ' sign, trying to guess
 		artist, title = guess_artist_title(fullname)
 		if title != 'unknown' and artist != 'unknown':
 			counters['Guessed'] += 1

 	# fallback: channel name as artist, whole video title as title
 	if artist == 'unknown' or title == 'unknown':
 		counters['Fallback'] += 1
 	if artist == 'unknown' and channel is not None:
 		artist = channel
 	if title == 'unknown':
 		title = fullname

 	# clean mess: drop any 【...】 decoration left in the artist name
 	artist = re.sub('【.*】', '', artist).strip()
 	return artist, title.strip()


 def main():
 	"""Read watch-history.json from the working directory and print the tracks."""
 	# Explicit UTF-8 so non-ASCII titles decode the same on every platform;
 	# the with-block closes the file once it is parsed.
 	with open('watch-history.json', encoding='utf-8') as history_file:
 		items = json.load(history_file)
 	music = [x for x in items if x['header'] == 'YouTube Music']

 	counters = {
 		'Total': len(music),
 		'Native (GPM/YTM)': 0,
 		'Known': 0,
 		'Guessed': 0,
 		'Fallback': 0,
 		'Removed': 0,
 		'Without metadata': 0,
 	}

 	for track in music:
 		parsed = parse_track(track, counters)
 		if parsed is None:
 			continue
 		artist, title = parsed

 		# Alternative CSV output, kept disabled as in the original:
 		#print('"{}","{}","","{}","",""'.format(artist,title,re.sub('\\.\\d{3}$', '', track['time'].replace('T', ' ').replace('Z', ''))))
 		print(artist, '\t', title)

 	print('-----------------', file=sys.stderr)
 	for (counter, value) in counters.items():
 		print('{}: {}'.format(counter, value), file=sys.stderr)


 if __name__ == '__main__':
 	main()
	import json, re, sys

	# Turn the 'YouTube Music' entries of watch-history.json into one
	# "artist <TAB> title" line per track on stdout; a summary of how each track
	# was classified is written to stderr.

	# Channel name -> artist, for channels whose video titles are the bare track
	# title (no "Artist - Title" separator).
	known_music_channels = {
		'Reol Official': 'Reol'
	}


	def guess_artist_title(fullname):
		"""Guess (artist, title) from a video title without an ' - ' separator.

		The heuristics run in order and a later match overrides an earlier one.
		Either element is the string 'unknown' when no heuristic produced it.
		"""
		artist = 'unknown'
		title = 'unknown'

		# 1. Asian-style brackets
		match_quotes_a = re.search('「.*?」', fullname)
		match_quotes_b = re.search('【.*?】', fullname)
		match_quotes_c = re.search('『.*?』', fullname)

		if match_quotes_a:
			title = match_quotes_a[0][1:-1]
		if match_quotes_b:
			artist = match_quotes_b[0][1:-1]
		if match_quotes_c:
			# 『artist』 followed by the title as the rest of the string
			artist = match_quotes_c[0][1:-1]
			title = fullname[match_quotes_c.end():].strip()

		# 2. artist: title style
		n_parts = re.split(r':\s+', fullname)
		if len(n_parts) == 2:
			artist = n_parts[0]
			title = n_parts[1]

		# 3. title by artist style
		n_parts = re.split(r'\s+by\s+', fullname)
		if len(n_parts) == 2:
			artist = n_parts[1]
			title = n_parts[0]

		return artist, title


	def parse_track(track, counters):
		"""Return (artist, title) for one history entry, or None to skip it.

		track is one decoded entry of watch-history.json. counters is a dict that
		is updated in place with the category the entry fell into. Entries that
		are bare URLs or removed videos are counted and return None.

		If the entry's title does not start with 'Watched ', the entry is printed
		to stderr and the program exits with status 1 (SystemExit).
		"""
		if not track['title'].startswith('Watched '):
			# Diagnostics go to stderr so stdout stays pure track data.
			print(track, file=sys.stderr)
			sys.exit(1)

		fullname = track['title'][8:]  # drop the 'Watched ' prefix

		# A missing or empty 'subtitles' list means there is no channel name.
		subtitles = track.get('subtitles')
		channel = subtitles[0].get('name') if subtitles else None

		# "Artist - Title", separated by a hyphen, en dash or em dash.
		parts = re.split(r'\s+[–—-]\s+', fullname)
		artist = 'unknown'
		title = 'unknown'

		if len(parts) >= 2:
			# NOTE: only the first two pieces are used; text after a second
			# separator is dropped.
			artist = parts[0]
			title = parts[1]
			counters['Guessed'] += 1
		elif channel is not None and channel.endswith(' - Topic'):
			# Native YTM/GPM artist: strip the 8-character ' - Topic' suffix
			artist = channel[:-8]
			title = fullname
			counters['Native (GPM/YTM)'] += 1
		elif 'watch?' in track['title']:
			# no metadata at all, the title is just the video URL
			counters['Without metadata'] += 1
			return None
		elif 'a video that has been removed' in track['title']:
			counters['Removed'] += 1
			return None
		elif channel in known_music_channels:
			# known authors on youtube
			title = fullname
			artist = known_music_channels[channel]
			counters['Known'] += 1
		else:
			# 3rd-party upload without ' - ' sign, trying to guess
			artist, title = guess_artist_title(fullname)
			if title != 'unknown' and artist != 'unknown':
				counters['Guessed'] += 1

		# fallback: channel name as artist, whole video title as title
		if artist == 'unknown' or title == 'unknown':
			counters['Fallback'] += 1
		if artist == 'unknown' and channel is not None:
			artist = channel
		if title == 'unknown':
			title = fullname

		# clean mess: drop any 【...】 decoration left in the artist name
		artist = re.sub('【.*】', '', artist).strip()
		return artist, title.strip()


	def main():
		"""Read watch-history.json from the working directory and print the tracks."""
		# Explicit UTF-8 so non-ASCII titles decode the same on every platform;
		# the with-block closes the file once it is parsed.
		with open('watch-history.json', encoding='utf-8') as history_file:
			items = json.load(history_file)
		music = [x for x in items if x['header'] == 'YouTube Music']

		counters = {
			'Total': len(music),
			'Native (GPM/YTM)': 0,
			'Known': 0,
			'Guessed': 0,
			'Fallback': 0,
			'Removed': 0,
			'Without metadata': 0,
		}

		for track in music:
			parsed = parse_track(track, counters)
			if parsed is None:
				continue
			artist, title = parsed

			# Alternative CSV output, kept disabled as in the original:
			#print('"{}","{}","","{}","",""'.format(artist,title,re.sub('\\.\\d{3}$', '', track['time'].replace('T', ' ').replace('Z', ''))))
			print(artist, '\t', title)

		print('-----------------', file=sys.stderr)
		for (counter, value) in counters.items():
			print('{}: {}'.format(counter, value), file=sys.stderr)


	if __name__ == '__main__':
		main()