cmaureir · March 7, 2019 12:14
diff --git a/compare.py b/compare.py
 import numpy as np
 import pandas as pd

 # content from '1.csv'
 #
 # a,b,c,d
 # 1,0.1,"a-10","hello"
 # 2,0.2,"a-11","hola"
 # 3,0.3,"a-12","hallo"
 # 4,0.5,"a-13","hello"

 # content from '2.csv'
 # other,other_c
 # 3.1,"a-99"
 # 4.2,"a-10"
 # 6.3,"a-22"
 # 1.5,"a-11"

 # Load both CSV files and keep only the columns that are printed / compared.
 df_a = pd.read_csv("1.csv")
 df_b = pd.read_csv("2.csv")

 a = df_a[['a', 'b', 'c']]
 b = df_b[['other', 'other_c']]

 print(a)
 print(b)
 print("-"*10)


 # The mask has to hold one boolean per row of `a`: "is this row's `c` value
 # present in `b['other_c']`?".  Building the mask from `b['other_c']` instead
 # would select rows of `a` by the row positions of `b`, which returns
 # unrelated rows and may raise an indexing error when the two frames'
 # indexes do not line up.
 c_in_b = a['c'].isin(b['other_c'])

 print("values from `1.csv` that are in `2.csv` based on column `c`")
 contains = a[c_in_b]
 print(contains)
 print("-"*10)

 print("values from `1.csv` that are NOT in `2.csv` based on column `c`")
 diff = a[~c_in_b]  # `~` negates the boolean mask element-wise
 print(diff)

 # Output
 #
 #    a    b     c
 # 0  1  0.1  a-10
 # 1  2  0.2  a-11
 # 2  3  0.3  a-12
 # 3  4  0.5  a-13
 #    other other_c
 # 0    3.1    a-99
 # 1    4.2    a-10
 # 2    6.3    a-22
 # 3    1.5    a-11
 # ----------
 # values from `1.csv` that are in `2.csv` based on column `c`
 #    a    b     c
 # 0  1  0.1  a-10
 # 1  2  0.2  a-11
 # ----------
 # values from `1.csv` that are NOT in `2.csv` based on column `c`
 #    a    b     c
 # 2  3  0.3  a-12
 # 3  4  0.5  a-13
 #
	import numpy as np
	import pandas as pd

	# content from '1.csv'
	#
	# a,b,c,d
	# 1,0.1,"a-10","hello"
	# 2,0.2,"a-11","hola"
	# 3,0.3,"a-12","hallo"
	# 4,0.5,"a-13","hello"

	# content from '2.csv'
	# other,other_c
	# 3.1,"a-99"
	# 4.2,"a-10"
	# 6.3,"a-22"
	# 1.5,"a-11"

	# Load both CSV files and keep only the columns that are printed / compared.
	df_a = pd.read_csv("1.csv")
	df_b = pd.read_csv("2.csv")

	a = df_a[['a', 'b', 'c']]
	b = df_b[['other', 'other_c']]

	print(a)
	print(b)
	print("-"*10)


	# The mask has to hold one boolean per row of `a`: "is this row's `c` value
	# present in `b['other_c']`?".  Building the mask from `b['other_c']` instead
	# would select rows of `a` by the row positions of `b`, which returns
	# unrelated rows and may raise an indexing error when the two frames'
	# indexes do not line up.
	c_in_b = a['c'].isin(b['other_c'])

	print("values from `1.csv` that are in `2.csv` based on column `c`")
	contains = a[c_in_b]
	print(contains)
	print("-"*10)

	print("values from `1.csv` that are NOT in `2.csv` based on column `c`")
	diff = a[~c_in_b]  # `~` negates the boolean mask element-wise
	print(diff)

	# Output
	#
	#    a    b     c
	# 0  1  0.1  a-10
	# 1  2  0.2  a-11
	# 2  3  0.3  a-12
	# 3  4  0.5  a-13
	#    other other_c
	# 0    3.1    a-99
	# 1    4.2    a-10
	# 2    6.3    a-22
	# 3    1.5    a-11
	# ----------
	# values from `1.csv` that are in `2.csv` based on column `c`
	#    a    b     c
	# 0  1  0.1  a-10
	# 1  2  0.2  a-11
	# ----------
	# values from `1.csv` that are NOT in `2.csv` based on column `c`
	#    a    b     c
	# 2  3  0.3  a-12
	# 3  4  0.5  a-13
	#