Last active
August 17, 2021 19:54
-
-
Save guimatheus92/5bf038f94abe46056b79a0a3a640e1bd to your computer and use it in GitHub Desktop.
Netflix Recommendation System: from the Netflix TXT files, we create a single, fully combined data file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# From the TXT files, we are going to create one fully combined CSV file.
def combine_rating_files(sources, destination):
    """Merge movie-rating TXT files into a single CSV file.

    The source files are read in order, line by line. A line ending with a
    colon (``<movie_id>:``) is treated as a header that sets the current
    movie id; every other non-blank line is written to *destination*
    prefixed with that movie id and a comma. The current movie id carries
    over from one source file to the next.

    Args:
        sources: Paths of the TXT files to read, in order.
        destination: Path of the CSV file to create (overwritten if present).

    Returns:
        The number of rating rows written to *destination*.

    Raises:
        ValueError: If a rating row appears before any movie id header.
    """
    written = 0
    movie_id = None
    # Context managers guarantee both files are closed even if a row fails.
    with open(destination, mode='w') as dataset:
        for file in sources:
            print("Reading the file {}...".format(file))
            with open(file) as f:
                for row in f:
                    # Drop the end-of-line character and surrounding blanks
                    row = row.strip()
                    if not row:
                        # Blank lines carry no rating; skip them instead of
                        # emitting a row that only holds the movie id.
                        continue
                    if row.endswith(':'):
                        # Header row: keep only the movie id
                        movie_id = row[:-1]
                        continue
                    if movie_id is None:
                        raise ValueError(
                            "Rating row found before any movie id header "
                            "in {}: {!r}".format(file, row)
                        )
                    # The row is already comma separated, so prefixing the
                    # movie id is enough; no split/join round trip needed.
                    dataset.write(movie_id + ',' + row + '\n')
                    written += 1
            print("Finished.\n")
    return written


# If the combined file does not exist yet, we create it in write mode (w)
if not os.path.isfile(r'kaggle/working/fullcombined_data.csv'):
    # Process start time
    start = datetime.now()
    # Paths of the files that are going to be combined into the full file
    files = [
        r'kaggle/input/netflix-prize-data/combined_data_1.txt',
        r'kaggle/input/netflix-prize-data/combined_data_2.txt',
        r'kaggle/input/netflix-prize-data/combined_data_3.txt',
        r'kaggle/input/netflix-prize-data/combined_data_4.txt',
    ]
    # Only the files that are actually present are read, so a partially
    # available dataset no longer fails when opening a missing file.
    existing_files = [file for file in files if os.path.isfile(file)]
    filecount = len(existing_files)
    # If there is any combined_data*.txt file in path
    if filecount > 0:
        combine_rating_files(
            existing_files, r'kaggle/working/fullcombined_data.csv'
        )
        # Print elapsed time
        print('Elapsed time to load all the files:', datetime.now() - start)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment