guimatheus92 · August 17, 2021 19:54
diff --git a/fullcombined_data.py b/fullcombined_data.py
 # Merge the combined_data_*.txt files into one CSV file.
 #
 # Input layout, as handled below: a row ending in ":" is a header that carries
 # a movie id; every other non-empty row is a comma-separated record that
 # belongs to the most recent header. Each record is written to the output
 # with that movie id prepended as the first column.
 #
 # Nothing is done when the output file already exists.
 if not os.path.isfile(r'kaggle/working/fullcombined_data.csv'):
     # Process start time
     start = datetime.now()

     # Path and files that are going to be combined, in output order
     files = [r'kaggle/input/netflix-prize-data/combined_data_1.txt',
              r'kaggle/input/netflix-prize-data/combined_data_2.txt',
              r'kaggle/input/netflix-prize-data/combined_data_3.txt',
              r'kaggle/input/netflix-prize-data/combined_data_4.txt']

     # Only inputs that are actually present are read, so a partial download
     # is combined instead of failing on the first missing file.
     existing_files = [path for path in files if os.path.isfile(path)]
     filecount = len(existing_files)

     # If there is any combined_data*.txt file in path
     if filecount > 0:
         # Movie id of the most recent header row. It is deliberately kept
         # across files, so records at the top of a file continue the movie
         # that the previous file ended on.
         movie_id = None

         # The context manager closes the output even if reading fails midway
         with open(r'kaggle/working/fullcombined_data.csv', mode='w') as dataset:
             for file in existing_files:
                 print("Reading the file {}...".format(file))
                 with open(file) as f:
                     for row in f:
                         # Drop the end-of-line character and outer whitespace
                         row = row.strip()
                         if not row:
                             # Blank lines carry no record; skipping them
                             # avoids writing an empty "movie_id," row.
                             continue
                         if row.endswith(':'):
                             # Header row: keep only the movie id
                             movie_id = row.replace(':', '')
                         elif movie_id is None:
                             raise ValueError(
                                 "Record found before any movie id header "
                                 "in {}: {!r}".format(file, row))
                         else:
                             # Same text as splitting on ',', inserting the
                             # movie id at index zero and joining again.
                             dataset.write(movie_id + ',' + row + '\n')
                 print("Finished.\n")

         # Print elapsed time
         print('Elapsed time to load all the files:', datetime.now() - start)
	# Merge the combined_data_*.txt files into one CSV file.
	#
	# Input layout, as handled below: a row ending in ":" is a header that
	# carries a movie id; every other non-empty row is a comma-separated record
	# that belongs to the most recent header. Each record is written to the
	# output with that movie id prepended as the first column.
	#
	# Nothing is done when the output file already exists.
	if not os.path.isfile(r'kaggle/working/fullcombined_data.csv'):
		# Process start time
		start = datetime.now()

		# Path and files that are going to be combined, in output order
		files = [
			r'kaggle/input/netflix-prize-data/combined_data_1.txt',
			r'kaggle/input/netflix-prize-data/combined_data_2.txt',
			r'kaggle/input/netflix-prize-data/combined_data_3.txt',
			r'kaggle/input/netflix-prize-data/combined_data_4.txt',
		]

		# Only inputs that are actually present are read, so a partial
		# download is combined instead of failing on the first missing file.
		existing_files = [path for path in files if os.path.isfile(path)]
		filecount = len(existing_files)

		# If there is any combined_data*.txt file in path
		if filecount > 0:
			# Movie id of the most recent header row. It is deliberately kept
			# across files, so records at the top of a file continue the movie
			# that the previous file ended on.
			movie_id = None

			# The context manager closes the output even if reading fails
			with open(r'kaggle/working/fullcombined_data.csv', mode='w') as dataset:
				for file in existing_files:
					print("Reading the file {}...".format(file))
					with open(file) as f:
						for row in f:
							# Drop the end-of-line character and outer whitespace
							row = row.strip()
							if not row:
								# Blank lines carry no record; skipping them
								# avoids writing an empty "movie_id," row.
								continue
							if row.endswith(':'):
								# Header row: keep only the movie id
								movie_id = row.replace(':', '')
							elif movie_id is None:
								raise ValueError(
									"Record found before any movie id header "
									"in {}: {!r}".format(file, row))
							else:
								# Same text as splitting on ',', inserting the
								# movie id at index zero and joining again.
								dataset.write(movie_id + ',' + row + '\n')
					print("Finished.\n")

			# Print elapsed time
			print('Elapsed time to load all the files:', datetime.now() - start)