Created
May 13, 2018 14:11
-
-
Save HelloWorld017/0b52b4486247c2802e9d314c06ca8b5a to your computer and use it in GitHub Desktop.
Using an SVM to classify houses in Seoul and Daejeon
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import requests | |
import json | |
import os | |
# From https://gist.github.com/giftbott/6ab91e81a8e5ff67c631fcc7c97f1483
class ZigbangDl:
    """Download room listings from the Zigbang items API and export them to CSV.

    Fetched entries accumulate in ``self.rooms``. Each entry is expected to be
    a mapping whose ``'item'`` value holds the ``size``, ``deposit`` and
    ``rent`` fields that :meth:`save_info_to_csv` writes out.
    """

    # Seconds to wait on a single request, so one stalled connection cannot
    # hang the whole crawl.
    REQUEST_TIMEOUT = 10

    # CSV header, in order: size (in pyeong), deposit, monthly rent.
    CSV_COLUMNS = ["평 수", "보증금", "월세"]

    def __init__(self):
        self.rooms = []

    def crawl_item(self, zigbang_url):
        """Fetch one API URL and append the entries under its ``'items'`` key.

        Args:
            zigbang_url: Fully formed API URL to request.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.Timeout: If no answer arrives within ``REQUEST_TIMEOUT``.
        """
        response = requests.get(zigbang_url, timeout=self.REQUEST_TIMEOUT)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        init_data = json.loads(response.text)
        # A payload without 'items' (or with a null value) contributes
        # nothing instead of raising TypeError on the list concatenation.
        self.rooms += init_data.get('items') or []

    def crawl_items(self, room_id_list):
        """Crawl every room id in *room_id_list*, printing progress as it goes.

        Args:
            room_id_list: Sized iterable of integer room ids.
        """
        total = len(room_id_list)
        # Count from 1 so the final progress line reads "N / N".
        for position, room_id in enumerate(room_id_list, start=1):
            print("%d / %d" % (position, total))
            url = "https://api.zigbang.com/v1/items?detail=true&item_ids=%d&~~~&~~~&~~~" % room_id
            self.crawl_item(url)

    def save_info_to_csv(self, path):
        """Write size, deposit and rent of every crawled room to a CSV file.

        Args:
            path: Destination path of the CSV file.
        """
        rows = []
        for entry in self.rooms:
            room = entry.get('item')
            rows.append([
                room['size'],     # size
                room['deposit'],  # deposit
                room['rent'],     # monthly rent
            ])
        # Build the frame in one step; enlarging it row by row through
        # df.loc reallocates on every insert.
        df = pd.DataFrame(rows, columns=self.CSV_COLUMNS)
        df.to_csv(path)
# Load the room ids to crawl, keyed by city. The context manager closes the
# file handle as soon as the JSON has been parsed.
with open("./zigbang_integrated.json", encoding="utf-8") as id_file:
    zigbang_list = json.load(id_file)

zigbang_seoul = zigbang_list['seoul']
zigbang_daejeon = zigbang_list['daejeon']

# Each city gets a fresh crawler so its rooms land in a separate CSV file.
print("Crawling Seoul...")
crawler = ZigbangDl()
crawler.crawl_items(zigbang_seoul)
crawler.save_info_to_csv('./zigbang_seoul.csv')

print("Crawling Daejeon...")
crawler = ZigbangDl()
crawler.crawl_items(zigbang_daejeon)
crawler.save_info_to_csv('./zigbang_daejeon.csv')
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import requests | |
import json | |
import os | |
class ZigbangDl:
    """Download room listings from the Zigbang items API and export them to CSV.

    Fetched entries accumulate in ``self.rooms``. Each entry is expected to be
    a mapping whose ``'item'`` value holds the ``size``, ``deposit`` and
    ``rent`` fields that :meth:`save_info_to_csv` writes out.
    """

    # Seconds to wait on a single request, so one stalled connection cannot
    # hang the whole crawl.
    REQUEST_TIMEOUT = 10

    # CSV header, in order: size (in pyeong), deposit, monthly rent.
    CSV_COLUMNS = ["평 수", "보증금", "월세"]

    def __init__(self):
        self.rooms = []

    def crawl_item(self, zigbang_url):
        """Fetch one API URL and append the entries under its ``'items'`` key.

        Args:
            zigbang_url: Fully formed API URL to request.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.Timeout: If no answer arrives within ``REQUEST_TIMEOUT``.
        """
        response = requests.get(zigbang_url, timeout=self.REQUEST_TIMEOUT)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        init_data = json.loads(response.text)
        # A payload without 'items' (or with a null value) contributes
        # nothing instead of raising TypeError on the list concatenation.
        self.rooms += init_data.get('items') or []

    def crawl_items(self, room_id_list):
        """Crawl every room id in *room_id_list*, printing progress as it goes.

        Args:
            room_id_list: Sized iterable of integer room ids.
        """
        total = len(room_id_list)
        # Count from 1 so the final progress line reads "N / N".
        for position, room_id in enumerate(room_id_list, start=1):
            print("%d / %d" % (position, total))
            url = "https://api.zigbang.com/v1/items?detail=true&item_ids=%d&~~~&~~~&~~~" % room_id
            self.crawl_item(url)

    def save_info_to_csv(self, path):
        """Write size, deposit and rent of every crawled room to a CSV file.

        Args:
            path: Destination path of the CSV file.
        """
        rows = []
        for entry in self.rooms:
            room = entry.get('item')
            rows.append([
                room['size'],     # size
                room['deposit'],  # deposit
                room['rent'],     # monthly rent
            ])
        # Build the frame in one step; enlarging it row by row through
        # df.loc reallocates on every insert.
        df = pd.DataFrame(rows, columns=self.CSV_COLUMNS)
        df.to_csv(path)
# Load the room ids to crawl, keyed by city. The context manager closes the
# file handle as soon as the JSON has been parsed.
with open("./zigbang_integrated.json", encoding="utf-8") as id_file:
    zigbang_list = json.load(id_file)

zigbang_seoul = zigbang_list['seoul']
zigbang_daejeon = zigbang_list['daejeon']

# Each city gets a fresh crawler so its rooms land in a separate CSV file.
print("Crawling Seoul...")
crawler = ZigbangDl()
crawler.crawl_items(zigbang_seoul)
crawler.save_info_to_csv('./zigbang_seoul.csv')

print("Crawling Daejeon...")
crawler = ZigbangDl()
crawler.crawl_items(zigbang_daejeon)
crawler.save_info_to_csv('./zigbang_daejeon.csv')
import csv | |
from sklearn.svm import SVC | |
import matplotlib.pyplot as plt | |
# noinspection PyUnresolvedReferences | |
from mpl_toolkits.mplot3d import Axes3D | |
import numpy as np | |
import random | |
# Feature rows, filled in by read_csv(): one list of floats per CSV data row.
x_data = list()
# Class labels matched to x_data by position: 100 zeros followed by 100 ones.
y_data = [label for label in (0, 1) for _ in range(100)]
def read_csv(file):
    """Append the feature columns of every data row in *file* to ``x_data``.

    The first line is treated as a header and skipped. The first column of
    each remaining row is dropped, and the other cells are parsed as floats.

    Args:
        file: Path of the CSV file to read.

    Raises:
        ValueError: If a feature cell cannot be parsed as a float.
    """
    # Explicit utf-8 keeps reading independent of the platform's default
    # encoding; newline='' is what the csv module asks for so that it can
    # handle line endings itself.
    with open(file, encoding='utf-8', newline='') as csvfile:
        reader = csv.reader(csvfile)
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)
        x_data.extend(
            # Slice before converting so the dropped first column never has
            # to be numeric.
            [float(cell) for cell in row[1:]]
            for row in reader
            if row  # blank lines come back as empty rows; ignore them
        )
# noinspection PyUnresolvedReferences
def visualize():
    """Show a 3D scatter of the samples with the SVM's separating plane.

    Reads the module-level ``x_data``, ``y_data`` and the fitted ``svm``.
    Samples labelled 0 are drawn with 'x' markers, all others with 'o'.
    Blocks in ``plt.show()`` until the window is closed.
    """
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')

    # Group the samples by marker so each class is drawn with a single
    # scatter call instead of one call per point.
    points_by_marker = {'x': [], 'o': []}
    for x_datum, label in zip(x_data, y_data):
        marker = 'x' if label == 0 else 'o'
        points_by_marker[marker].append(x_datum)

    for marker, points in points_by_marker.items():
        if not points:
            continue
        ax.scatter(
            [point[0] for point in points],
            [point[1] for point in points],
            [point[2] for point in points],
            marker=marker,
        )

    # Grid over the first two features (the Size and Deposit axes).
    plane_x, plane_y = np.meshgrid(np.linspace(0, 30, 50), np.linspace(0, 30000, 50))

    def plane_z(x, y):
        # Decision boundary: w0*x + w1*y + w2*z + b = 0, solved for z.
        # The y term must be scaled by w1 like the x term is by w0.
        weights = svm.coef_[0]
        return (-svm.intercept_[0] - weights[0] * x - weights[1] * y) / weights[2]

    ax.plot_surface(plane_x, plane_y, plane_z(plane_x, plane_y))

    ax.set_xlabel('Size')
    ax.set_ylabel('Deposit')
    ax.set_zlabel('Rent')
    plt.show()
# Reading CSV data. Rows are matched to the labels in y_data by position,
# so the order of these two calls matters.
read_csv('./zigbang_daejeon.csv')
read_csv('./zigbang_seoul.csv')

# Randomly sampling the test data. Each sample is popped out of the training
# lists, so the test and training sets do not overlap.
TEST_SIZE = 20
x_test = []
y_test = []
for _ in range(TEST_SIZE):
    sample_index = random.randrange(len(x_data))
    x_test.append(x_data.pop(sample_index))
    y_test.append(y_data.pop(sample_index))

y_test = np.array(y_test)

# Fitting SVM on the samples that remain after the hold-out.
svm = SVC(kernel='linear')
svm.fit(x_data, y_data)

# Visualizing
visualize()

# Checking accuracy: percentage of held-out samples predicted correctly.
# The denominator is the actual test-set size, not a repeated literal.
predict = np.array(svm.predict(x_test))
accuracy = np.sum(predict == y_test) / len(y_test) * 100

print(predict, y_test)
print('Accuracy: ' + str(accuracy) + '%')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment