Last active
January 14, 2016 18:05
-
-
Save matheusportela/e7b6cd740ed14774e87b to your computer and use it in GitHub Desktop.
This script splits data loaded via GraphLab using two methods, SFrame.random_split and sklearn.cross_validation.train_test_split, and then reports the number of rows that differ, position by position, between the two resulting training sets.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import graphlab | |
import numpy as np | |
from sklearn.cross_validation import train_test_split | |
def main():
    """Compare the training splits produced by GraphLab and scikit-learn.

    Loads the SFrame stored at 'kc_house_data.gl' and splits it twice, both
    times with seed 0: once with ``SFrame.random_split(0.8)`` and once with
    ``train_test_split(test_size=0.2)``. It then prints the size of each
    training set and the number of rows that differ between them.

    Rows are compared position by position over the length of the shorter
    training set, so a row that appears in both sets but at different
    indices still counts as different. The count is reported against the
    size of the larger training set.
    """
    sales = graphlab.SFrame('kc_house_data.gl')

    # GraphLab split; only the training part is compared below.
    train_data_graphlab, _ = sales.random_split(0.8, seed=0)
    input_graphlab = train_data_graphlab['sqft_living']
    output_graphlab = train_data_graphlab['price']

    # scikit-learn split of the same two columns, converted to a 2-D array
    # whose column 0 is 'sqft_living' and column 1 is 'price'.
    data = sales['sqft_living', 'price'].to_numpy()
    train_data_sklearn, _ = train_test_split(
        data, test_size=0.2, random_state=0)
    input_sklearn = train_data_sklearn[:, 0]
    output_sklearn = train_data_sklearn[:, 1]

    graphlab_size = input_graphlab.size()
    sklearn_size = len(input_sklearn)
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print('Size graphlab: {}'.format(graphlab_size))
    print('Size sklearn: {}'.format(sklearn_size))

    max_size = max(graphlab_size, sklearn_size)

    # zip stops at the shortest input, so only the first
    # min(graphlab_size, sklearn_size) rows are compared.
    paired_rows = zip(
        input_graphlab, output_graphlab, input_sklearn, output_sklearn)
    diff = sum(
        1
        for in_graphlab, out_graphlab, in_sklearn, out_sklearn in paired_rows
        if in_graphlab != in_sklearn or out_graphlab != out_sklearn
    )

    print('Number of different rows: {}/{}'.format(diff, max_size))
# Run the comparison only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment