Created
December 15, 2016 10:58
-
-
Save mauromarano/d424b2d76722616f5547fd77a2b891d7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 71, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"#importing dependencies\n", | |
"import requests\n", | |
"from bs4 import BeautifulSoup\n", | |
"\n", | |
"base_url = 'http://www.subito.it/annunci-italia/vendita/appartamenti/?o='\n", | |
"pagine_da_cercare = 100\n", | |
"\n", | |
"def get_source( url,page=1):\n", | |
" url = url + str(page)\n", | |
" r = requests.get(url)\n", | |
" return r.content\n", | |
"\n", | |
"\n", | |
"#html_sources = get_source(base_url,1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 72, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"def get_data_from_page(source):\n", | |
"\n", | |
" apartments = []\n", | |
" soup = BeautifulSoup(source,\"html.parser\")\n", | |
" for apartment in soup.select('.item_description'):\n", | |
" try:\n", | |
" price = apartment.select('.item_price')[0].string\n", | |
" price = price.split(\" \")[0].strip()\n", | |
" specs = apartment.select('.item_specs')[0].string\n", | |
" mq = specs.split(\"mq\")[0]\n", | |
" mq = mq.split(\"-\")[1].strip()\n", | |
" city = apartment.select('.item_city')[0].string\n", | |
" city = city.split(\"(\")[1]\n", | |
" city = city.split(')')[0].strip()\n", | |
"\n", | |
" apartment = {\n", | |
" \"price\" : price,\n", | |
" \"mq\" : mq,\n", | |
" \"city\" : city\n", | |
" }\n", | |
" apartments.append(apartment)\n", | |
" except:\n", | |
" print \"Some values is missing\"\n", | |
" \n", | |
" return apartments\n", | |
" \n", | |
" \n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 73, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n", | |
"Some values is missing\n" | |
] | |
} | |
], | |
"source": [ | |
"#ogni elemento dell'array è un oggetto con i dati dell appartamento\n", | |
"apartaments = []\n", | |
"for i in range(pagine_da_cercare):\n", | |
" sources = get_source(base_url,i)\n", | |
" apartaments.append(get_data_from_page(sources))\n", | |
" \n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 74, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"\n", | |
"#final data\n", | |
"data = []\n", | |
"for apartament in apartaments:\n", | |
" for a in apartament:\n", | |
" city = a['city']\n", | |
" price = a['price']\n", | |
" mq = a['mq']\n", | |
" data.append([city,mq,price])\n", | |
" " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 75, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Ci sono un totale di 2823 appartamenti \n" | |
] | |
} | |
], | |
"source": [ | |
"#citta, mq\n", | |
"x = []\n", | |
"\n", | |
"#prezzo\n", | |
"y = []\n", | |
"\n", | |
"\n", | |
"for a in data:\n", | |
" x.append([a[0],a[1]])\n", | |
" y.append(a[2])\n", | |
"\n", | |
" \n", | |
"x_citta = []\n", | |
"for citta in x:\n", | |
" x_citta.append(citta[0])\n", | |
"\n", | |
"from sklearn import preprocessing\n", | |
"le = preprocessing.LabelEncoder()\n", | |
"le.fit(x_citta)\n", | |
"citta_transformate = le.transform(x_citta)\n", | |
"#le.inverse_transform([28,46])\n", | |
"#le.transform(['IM'])\n", | |
"\n", | |
"i = 0\n", | |
"for citta_transformata in citta_transformate:\n", | |
" x[i][0] = citta_transformata\n", | |
" i = i +1\n", | |
" \n", | |
"print \"Ci sono un totale di \" + str(len(data)) + \" appartamenti \"" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 103, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"[14]\n", | |
"[u'178.000']\n" | |
] | |
} | |
], | |
"source": [ | |
"from sklearn import tree\n", | |
"\n", | |
"clf = tree.DecisionTreeClassifier()\n", | |
"clf = clf.fit(x,y)\n", | |
"citta = le.transform(['BO'])\n", | |
"print citta\n", | |
"prediction = clf.predict([[citta,\"145\"]])\n", | |
"print prediction" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.10" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment