magixx · August 29, 2015 14:12
diff --git a/number_clustering.py b/number_clustering.py
 import regex
 # Sample release name used to exercise the clustering loop further down.
 test_release = 'Release.Name.v1.3.0.DISC.1.Other.Tags.x.264.<RESOLUTION>.WMV'

 # memory: tokens collected since the last flush (right-to-left order).
 # main: finished clusters, also collected right-to-left.
 memory = []
 main = []
 # The name is split on '.' and reversed so that numeric pieces are visited
 # before the tag they belong to.
 split_release = test_release.split('.')
 split_release.reverse()
 split_release_copy = split_release[:]

 # Tags that I want to know about
 # Every alternative must end at the end of the token (\Z). The codec group is
 # additionally anchored at the start so only a bare 'x' or 'h' token counts;
 # the earlier '\Ax|h' form let any token that merely ends in 'h' match.
 # The flags are the named form of the literal 258 (IGNORECASE=2 | VERSION1=256).
 number_reg = regex.compile(r'(?:(?P<disc>DISC)|(?P<update>UPDATE)|(?P<build>BUILD)|(?P<version>v\d+)|'
                            r'(?P<codec>\A[xh]))\Z', regex.IGNORECASE | regex.VERSION1)
 # Special functions for those found tags
 # Each modifier works on the list it is handed instead of reaching for the
 # module-level ``memory``. The 'disc' entry stays a lambda because
 # disc_join_plus is defined later in the file.
 extra_modifiers = {'codec': lambda memory_list: ''.join(memory_list),
                    'disc': lambda memory_list: disc_join_plus(memory_list)}


 def disc_join_plus(memory_list):
    """Print the pieces that follow a DISC tag and return its placeholder.

    memory_list is expected to hold the tag itself first, followed by the
    pieces that belong to it; everything after the first element is printed.
    Always returns the string '<DISC>'.
    """
    # Use the argument rather than the module-level ``memory`` list so the
    # function does not depend on hidden global state.
    print('DISC: {}'.format(memory_list[1:]))
    return '<DISC>'


 def is_numberlike(string):
    """Return True when ``string`` is a number or a mostly-numeric tag.

    A token qualifies if float() accepts it, or if it starts with a digit,
    ends with a letter and at least half of its characters are digits.
    An empty string is never number-like.
    """
    try:
        float(string)
        return True
    except ValueError:
        pass
    # An empty token has no first or last character to inspect.
    if not string:
        return False
    if not (string[0].isdigit() and string[-1].isalpha()):
        return False
    digit_count = sum(1 for char in string if char.isdigit())
    # Compare 2 * digits against the length instead of dividing: integer
    # division floors the ratio to 0 on Python 2, which made this test
    # impossible to satisfy.
    return 2 * digit_count >= len(string)

 # Walk the reversed tokens, collecting each one in ``memory`` until a token
 # shows up that can own the collected numbers, then flush the cluster to ``main``.
 # The progress line is the same on every pass, so build it once.
 nice = ''.join(token + '\t' for token in split_release)
 for k, v in enumerate(split_release):
    try:
        # prints out which item you're on nicely
        marker = '\t' * k
        print(nice)
        print(marker + '^')

        # item always put into memory
        memory.append(split_release_copy.pop(0))
        print('Main string is {} | Current memory: {}'.format(main, memory))
        do_special_work = number_reg.search(v)

        # if it's a special known tag type change it + the memory items to whatever
        if do_special_work and len(memory) > 1:

            # reverse the memory to original state
            memory.reverse()

            # see which regex group matched
            tag_name = next(group_name for group_name, group_value in
                            do_special_work.groupdict().items() if group_value)

            # do a regular join on the numbers or something special if specified
            if tag_name in extra_modifiers:
                main.append(extra_modifiers[tag_name](memory))
            else:
                main.append('.'.join(memory))

            # clear the memory
            memory = []

        elif is_numberlike(v) and k < len(split_release) - 1:
            # This item is a number (and not first item in original string) so it must belong to the upcoming item.
            # The bound has to be len - 1: comparing against len is always true, which left a
            # number-like first token waiting in memory forever and dropped it from the result.
            continue
        else:
            memory.reverse()
            # I might want to skip the join and directly add all items to memory 'as is' or something different,
            # i.e. this item (main.append(memory.pop())) >> main, then join rest of memory to main, however it seems
            # that numbers and number-like items should always belong to an item
            main.append('.'.join(memory))

            # clear the memory
            memory = []

    except IndexError as e:
        print(e)
 # Restore left-to-right order for the report.
 split_release.reverse()
 print('{} became:'.format(split_release))
 main.reverse()
 print(main)
 ---------------
 WMV	<RESOLUTION>	264	x	Tags	Other	1	DISC	0	3	v1	Name	Release	
 ^
 Main string is [] | Current memory: ['WMV']
 WMV	<RESOLUTION>	264	x	Tags	Other	1	DISC	0	3	v1	Name	Release	
 	^
 Main string is ['WMV'] | Current memory: ['<RESOLUTION>']
 WMV	<RESOLUTION>	264	x	Tags	Other	1	DISC	0	3	v1	Name	Release	
 		^
 Main string is ['WMV', '<RESOLUTION>'] | Current memory: ['264']
 WMV	<RESOLUTION>	264	x	Tags	Other	1	DISC	0	3	v1	Name	Release	
 			^
 Main string is ['WMV', '<RESOLUTION>'] | Current memory: ['264', 'x']
 WMV	<RESOLUTION>	264	x	Tags	Other	1	DISC	0	3	v1	Name	Release	
 				^
 Main string is ['WMV', '<RESOLUTION>', 'x264'] | Current memory: ['Tags']
 WMV	<RESOLUTION>	264	x	Tags	Other	1	DISC	0	3	v1	Name	Release	
 					^
 Main string is ['WMV', '<RESOLUTION>', 'x264', 'Tags'] | Current memory: ['Other']
 WMV	<RESOLUTION>	264	x	Tags	Other	1	DISC	0	3	v1	Name	Release	
 						^
 Main string is ['WMV', '<RESOLUTION>', 'x264', 'Tags', 'Other'] | Current memory: ['1']
 WMV	<RESOLUTION>	264	x	Tags	Other	1	DISC	0	3	v1	Name	Release	
 							^
 Main string is ['WMV', '<RESOLUTION>', 'x264', 'Tags', 'Other'] | Current memory: ['1', 'DISC']
 DISC: ['1']
 WMV	<RESOLUTION>	264	x	Tags	Other	1	DISC	0	3	v1	Name	Release	
 								^
 Main string is ['WMV', '<RESOLUTION>', 'x264', 'Tags', 'Other', '<DISC>'] | Current memory: ['0']
 WMV	<RESOLUTION>	264	x	Tags	Other	1	DISC	0	3	v1	Name	Release	
 									^
 Main string is ['WMV', '<RESOLUTION>', 'x264', 'Tags', 'Other', '<DISC>'] | Current memory: ['0', '3']
 WMV	<RESOLUTION>	264	x	Tags	Other	1	DISC	0	3	v1	Name	Release	
 										^
 Main string is ['WMV', '<RESOLUTION>', 'x264', 'Tags', 'Other', '<DISC>'] | Current memory: ['0', '3', 'v1']
 WMV	<RESOLUTION>	264	x	Tags	Other	1	DISC	0	3	v1	Name	Release	
 											^
 Main string is ['WMV', '<RESOLUTION>', 'x264', 'Tags', 'Other', '<DISC>', 'v1.3.0'] | Current memory: ['Name']
 WMV	<RESOLUTION>	264	x	Tags	Other	1	DISC	0	3	v1	Name	Release	
 												^
 Main string is ['WMV', '<RESOLUTION>', 'x264', 'Tags', 'Other', '<DISC>', 'v1.3.0', 'Name'] | Current memory: ['Release']
 ['Release', 'Name', 'v1', '3', '0', 'DISC', '1', 'Other', 'Tags', 'x', '264', '<RESOLUTION>', 'WMV'] became:
 ['Release', 'Name', 'v1.3.0', '<DISC>', 'Other', 'Tags', 'x264', '<RESOLUTION>', 'WMV']
	import regex
	# Sample release name used to exercise the clustering loop further down.
	test_release = 'Release.Name.v1.3.0.DISC.1.Other.Tags.x.264.<RESOLUTION>.WMV'

	# memory: tokens collected since the last flush (right-to-left order).
	# main: finished clusters, also collected right-to-left.
	memory = []
	main = []
	# The name is split on '.' and reversed so that numeric pieces are visited
	# before the tag they belong to.
	split_release = test_release.split('.')
	split_release.reverse()
	split_release_copy = split_release[:]

	# Tags that I want to know about
	# The alternation bars must be plain '|': an escaped '\|' matches a literal
	# pipe character and collapses the alternatives into one long literal.
	# The codec group is anchored at the start so only a bare 'x' or 'h' token
	# counts; '\Ax|h' let any token that merely ends in 'h' match.
	# The flags are the named form of the literal 258 (IGNORECASE=2 | VERSION1=256).
	number_reg = regex.compile(r'(?:(?P<disc>DISC)|(?P<update>UPDATE)|(?P<build>BUILD)|(?P<version>v\d+)|'
	                           r'(?P<codec>\A[xh]))\Z', regex.IGNORECASE | regex.VERSION1)
	# Special functions for those found tags
	# Each modifier works on the list it is handed instead of reaching for the
	# module-level ``memory``. The 'disc' entry stays a lambda because
	# disc_join_plus is defined later in the file.
	extra_modifiers = {'codec': lambda memory_list: ''.join(memory_list),
	                   'disc': lambda memory_list: disc_join_plus(memory_list)}


	def disc_join_plus(memory_list):
	    """Print the pieces that follow a DISC tag and return its placeholder.

	    memory_list is expected to hold the tag itself first, followed by the
	    pieces that belong to it; everything after the first element is printed.
	    Always returns the string '<DISC>'.
	    """
	    # Use the argument rather than the module-level ``memory`` list so the
	    # function does not depend on hidden global state.
	    print('DISC: {}'.format(memory_list[1:]))
	    return '<DISC>'


	def is_numberlike(string):
	    """Return True when ``string`` is a number or a mostly-numeric tag.

	    A token qualifies if float() accepts it, or if it starts with a digit,
	    ends with a letter and at least half of its characters are digits.
	    An empty string is never number-like.
	    """
	    try:
	        float(string)
	        return True
	    except ValueError:
	        pass
	    # An empty token has no first or last character to inspect.
	    if not string:
	        return False
	    if not (string[0].isdigit() and string[-1].isalpha()):
	        return False
	    digit_count = sum(1 for char in string if char.isdigit())
	    # Compare 2 * digits against the length instead of dividing: integer
	    # division floors the ratio to 0 on Python 2, which made this test
	    # impossible to satisfy.
	    return 2 * digit_count >= len(string)

	# Walk the reversed tokens, collecting each one in ``memory`` until a token
	# shows up that can own the collected numbers, then flush the cluster to ``main``.
	# The progress line is the same on every pass, so build it once.
	nice = ''.join(token + '\t' for token in split_release)
	for k, v in enumerate(split_release):
	    try:
	        # prints out which item you're on nicely
	        marker = '\t' * k
	        print(nice)
	        print(marker + '^')

	        # item always put into memory
	        memory.append(split_release_copy.pop(0))
	        print('Main string is {} | Current memory: {}'.format(main, memory))
	        do_special_work = number_reg.search(v)

	        # if it's a special known tag type change it + the memory items to whatever
	        if do_special_work and len(memory) > 1:

	            # reverse the memory to original state
	            memory.reverse()

	            # see which regex group matched
	            tag_name = next(group_name for group_name, group_value in
	                            do_special_work.groupdict().items() if group_value)

	            # do a regular join on the numbers or something special if specified
	            if tag_name in extra_modifiers:
	                main.append(extra_modifiers[tag_name](memory))
	            else:
	                main.append('.'.join(memory))

	            # clear the memory
	            memory = []

	        elif is_numberlike(v) and k < len(split_release) - 1:
	            # This item is a number (and not first item in original string) so it must belong to the upcoming item.
	            # The bound has to be len - 1: comparing against len is always true, which left a
	            # number-like first token waiting in memory forever and dropped it from the result.
	            continue
	        else:
	            memory.reverse()
	            # I might want to skip the join and directly add all items to memory 'as is' or something different,
	            # i.e. this item (main.append(memory.pop())) >> main, then join rest of memory to main, however it seems
	            # that numbers and number-like items should always belong to an item
	            main.append('.'.join(memory))

	            # clear the memory
	            memory = []

	    except IndexError as e:
	        print(e)
	# Restore left-to-right order for the report.
	split_release.reverse()
	print('{} became:'.format(split_release))
	main.reverse()
	print(main)
	---------------
	WMV	<RESOLUTION>	264	x	Tags	Other	1	DISC	0	3	v1	Name	Release	
	^
	Main string is [] | Current memory: ['WMV']
	WMV	<RESOLUTION>	264	x	Tags	Other	1	DISC	0	3	v1	Name	Release	
		^
	Main string is ['WMV'] | Current memory: ['<RESOLUTION>']
	WMV	<RESOLUTION>	264	x	Tags	Other	1	DISC	0	3	v1	Name	Release	
			^
	Main string is ['WMV', '<RESOLUTION>'] | Current memory: ['264']
	WMV	<RESOLUTION>	264	x	Tags	Other	1	DISC	0	3	v1	Name	Release	
				^
	Main string is ['WMV', '<RESOLUTION>'] | Current memory: ['264', 'x']
	WMV	<RESOLUTION>	264	x	Tags	Other	1	DISC	0	3	v1	Name	Release	
					^
	Main string is ['WMV', '<RESOLUTION>', 'x264'] | Current memory: ['Tags']
	WMV	<RESOLUTION>	264	x	Tags	Other	1	DISC	0	3	v1	Name	Release	
						^
	Main string is ['WMV', '<RESOLUTION>', 'x264', 'Tags'] | Current memory: ['Other']
	WMV	<RESOLUTION>	264	x	Tags	Other	1	DISC	0	3	v1	Name	Release	
							^
	Main string is ['WMV', '<RESOLUTION>', 'x264', 'Tags', 'Other'] | Current memory: ['1']
	WMV	<RESOLUTION>	264	x	Tags	Other	1	DISC	0	3	v1	Name	Release	
								^
	Main string is ['WMV', '<RESOLUTION>', 'x264', 'Tags', 'Other'] | Current memory: ['1', 'DISC']
	DISC: ['1']
	WMV	<RESOLUTION>	264	x	Tags	Other	1	DISC	0	3	v1	Name	Release	
									^
	Main string is ['WMV', '<RESOLUTION>', 'x264', 'Tags', 'Other', '<DISC>'] | Current memory: ['0']
	WMV	<RESOLUTION>	264	x	Tags	Other	1	DISC	0	3	v1	Name	Release	
										^
	Main string is ['WMV', '<RESOLUTION>', 'x264', 'Tags', 'Other', '<DISC>'] | Current memory: ['0', '3']
	WMV	<RESOLUTION>	264	x	Tags	Other	1	DISC	0	3	v1	Name	Release	
											^
	Main string is ['WMV', '<RESOLUTION>', 'x264', 'Tags', 'Other', '<DISC>'] | Current memory: ['0', '3', 'v1']
	WMV	<RESOLUTION>	264	x	Tags	Other	1	DISC	0	3	v1	Name	Release	
												^
	Main string is ['WMV', '<RESOLUTION>', 'x264', 'Tags', 'Other', '<DISC>', 'v1.3.0'] | Current memory: ['Name']
	WMV	<RESOLUTION>	264	x	Tags	Other	1	DISC	0	3	v1	Name	Release	
													^
	Main string is ['WMV', '<RESOLUTION>', 'x264', 'Tags', 'Other', '<DISC>', 'v1.3.0', 'Name'] | Current memory: ['Release']
	['Release', 'Name', 'v1', '3', '0', 'DISC', '1', 'Other', 'Tags', 'x', '264', '<RESOLUTION>', 'WMV'] became:
	['Release', 'Name', 'v1.3.0', '<DISC>', 'Other', 'Tags', 'x264', '<RESOLUTION>', 'WMV']