Skip to content

Instantly share code, notes, and snippets.

@jen6
Last active May 14, 2024 13:39
Show Gist options
  • Save jen6/a1fd8a1329b8c1c8b938262376148a23 to your computer and use it in GitHub Desktop.
Save jen6/a1fd8a1329b8c1c8b938262376148a23 to your computer and use it in GitHub Desktop.
직방을 크롤링해 전세자금 대출이 가능한 집 알아보기
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"import copy\n",
"import csv\n",
"import time\n",
"import json\n",
"import uuid\n",
"import pprint\n",
"import asyncio\n",
"from typing import List\n",
"from urllib import parse\n",
"from urllib.parse import quote\n",
"import inspect\n",
"import datetime\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class magic_fstring_function:\n",
"    \"\"\"Deferred f-string: a str.format template rendered on first use.\n",
"\n",
"    The first str() call fills the template from the global and local\n",
"    variables of the frame that triggered it; the rendered text is cached\n",
"    and returned unchanged by every later str() call.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, payload):\n",
"        self.payload = payload  # template text with {name} placeholders\n",
"        self.cached = None      # rendered text, filled in lazily\n",
"\n",
"    def __str__(self):\n",
"        if self.cached is not None:\n",
"            return self.cached\n",
"        caller = inspect.currentframe().f_back\n",
"        # Locals are merged last so they shadow globals of the same name.\n",
"        namespace = {**caller.f_globals, **caller.f_locals}\n",
"        self.cached = self.payload.format(**namespace)\n",
"        return self.cached\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Random id for this session. It gets its own name so the uuid module is\n",
"# not overwritten and this cell can be executed more than once.\n",
"session_uuid = str(uuid.uuid4())\n",
"\n",
"# API endpoints; {item_id} is filled in with str.format-style substitution.\n",
"item_list_api = \"https://apis.zigbang.com/v2/items\"\n",
"describe_list_api = item_list_api + '/list'\n",
"item_describe_api = \"https://apis.zigbang.com/v3/items?item_ids={item_id}&detail=true\"\n",
"item_view_url = \"https://zigbang.com/home/oneroom/items/{item_id}\"\n",
"\n",
"referer = \"https://zigbang.com/home/oneroom/subways/414/items\"\n",
"\n",
"# Headers passed to every requests.get call in this notebook.\n",
"headers = {\n",
"    'Host':'apis.zigbang.com',\n",
"    'Connection':'keep-alive',\n",
"    'Pragma':'no-cache',\n",
"    'Cache-Control':'no-cache',\n",
"    'Accept':'application/json, text/plain, */*',\n",
"    'Origin':'https://zigbang.com',\n",
"    'User-Agent':'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Mobile Safari/537.36',\n",
"    'DNT':'1',\n",
"    'Sec-Fetch-Site':'same-site',\n",
"    'Sec-Fetch-Mode':'cors',\n",
"    'Referer':referer,\n",
"    'Accept-Encoding':'gzip, deflate, br',\n",
"    'Accept-Language':'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',\n",
"}\n",
"\n",
"# Listing ids collected so far; get_room_list adds one key per id.\n",
"items = {}\n",
"# Phrases that disqualify a listing; export_items skips listings containing one.\n",
"sad_words = [ '대출x', '대출불가', '전세 안됩']\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def export_items(items, wish='대출', max_age_days=4):\n",
"    \"\"\"Select loan-friendly, recently updated listings and trim them for export.\n",
"\n",
"    A listing is kept when ``wish`` occurs in its title or description,\n",
"    none of the module-level ``sad_words`` occurs in either text, and its\n",
"    'updated_at2' date is at most ``max_age_days`` days old.\n",
"\n",
"    Args:\n",
"        items: iterable of records that hold the listing dict under 'item'.\n",
"        wish: keyword a listing has to mention.\n",
"        max_age_days: maximum age of 'updated_at2', in days.\n",
"\n",
"    Returns:\n",
"        A list of new dicts containing only the export columns. The 'url'\n",
"        column is built from ``item_view_url`` and is also stored on the\n",
"        source listing dict.\n",
"\n",
"    Raises:\n",
"        KeyError: if a listing lacks one of the export columns.\n",
"        ValueError: if 'updated_at2' is not formatted as YYYY-MM-DD.\n",
"    \"\"\"\n",
"    fieldnames = [\n",
"        'id',\n",
"        'title',\n",
"        'description',\n",
"        'deposit',\n",
"        'agent_name',\n",
"        'agent_mobile',\n",
"        '_floor',\n",
"        'address1',\n",
"        'address2',\n",
"        'address3',\n",
"        'size',\n",
"        'size_m2',\n",
"        'status',\n",
"        'room_type',\n",
"        'manage_cost_inc',\n",
"        'updated_at2',\n",
"        'url',\n",
"    ]\n",
"\n",
"    maximum_updated = datetime.timedelta(days=max_age_days)\n",
"    now = datetime.datetime.now()  # one reference time for the whole batch\n",
"\n",
"    filtered_item = []\n",
"    for record in items:\n",
"        item = record['item']\n",
"        # NOTE(review): assumes text fields can be null, the way\n",
"        # get_full_address already treats address2/address3 - confirm.\n",
"        title = item.get('title') or ''\n",
"        description = item.get('description') or ''\n",
"        if wish not in title and wish not in description:\n",
"            continue\n",
"        # A rejection phrase such as '대출불가' contains the keyword itself, so\n",
"        # it must be looked for in the title as well as in the description.\n",
"        if any(word in title or word in description for word in sad_words):\n",
"            continue\n",
"        item_updated = datetime.datetime.strptime(item['updated_at2'], '%Y-%m-%d')\n",
"        if now - item_updated > maximum_updated:\n",
"            continue\n",
"        # The template only needs item_id, so plain str.format is enough.\n",
"        item['url'] = item_view_url.format(item_id=item['id'])\n",
"        filtered_item.append({name: item[name] for name in fieldnames})\n",
"    return filtered_item"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def describe_room_list(items, batch_size=30):\n",
"    \"\"\"Fetch the detail record of every collected listing id.\n",
"\n",
"    Args:\n",
"        items: dict whose keys are listing ids; the values are not used.\n",
"        batch_size: number of ids sent in one API request.\n",
"\n",
"    Returns:\n",
"        A list with the entries of the 'items' array of every response,\n",
"        in request order.\n",
"\n",
"    Raises:\n",
"        requests.HTTPError: if the API answers with an error status.\n",
"    \"\"\"\n",
"    item_ids = list(items)\n",
"\n",
"    # Work on a copy so the shared module-level headers stay untouched.\n",
"    request_headers = dict(headers)\n",
"    request_headers['Accept'] = 'application/json'\n",
"    request_headers['Referer'] = 'https://zigbang.com/home/oneroom/subways/37/items'\n",
"\n",
"    details = []\n",
"    for start_idx in range(0, len(item_ids), batch_size):\n",
"        # Slice ends are exclusive: this keeps the last id of each batch.\n",
"        sub_ids = item_ids[start_idx:start_idx + batch_size]\n",
"        # Ids go out as a bracketed, comma-separated list, e.g. [1,2,3].\n",
"        item_id = '[' + ','.join(str(one_id) for one_id in sub_ids) + ']'\n",
"        resp = requests.get(\n",
"            url=item_describe_api.format(item_id=item_id),\n",
"            headers=request_headers,\n",
"            timeout=5,\n",
"        )\n",
"        try:\n",
"            resp.raise_for_status()\n",
"            resp.encoding = 'utf-8'\n",
"            result = resp.json()\n",
"        finally:\n",
"            # Release the connection even when the response is unusable.\n",
"            resp.close()\n",
"        details += result['items']\n",
"        time.sleep(1)  # wait one second before the next request\n",
"\n",
"    return details"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_room_list(\n",
"    items,\n",
"    deposit_gteq: int = 0,\n",
"    deposit_lteq: int = 8000,\n",
"    domain: str = 'zigbang',\n",
"    floor_in: str = 'ground',\n",
"    geohash: str = 'wydjr',\n",
"    rent_gteq: int = 0,\n",
"    sales_type_in: str = '전세',\n",
"    service_type_eq: str = '원룸',\n",
"):\n",
"    \"\"\"Query the item-list endpoint and register every returned listing id.\n",
"\n",
"    Every argument except ``items`` is forwarded unchanged as the query\n",
"    parameter of the same name.\n",
"\n",
"    Args:\n",
"        items: dict to fill; each id found is added as a string key that\n",
"            maps to a new empty dict.\n",
"\n",
"    Returns:\n",
"        The same ``items`` dict that was passed in.\n",
"    \"\"\"\n",
"    query = {\n",
"        'deposit_gteq': deposit_gteq,\n",
"        'deposit_lteq': deposit_lteq,\n",
"        'domain': domain,\n",
"        'floor_in': floor_in,\n",
"        'geohash': geohash,\n",
"        'rent_gteq': rent_gteq,\n",
"        'sales_type_in': sales_type_in,\n",
"        'service_type_eq': service_type_eq,\n",
"    }\n",
"    resp = requests.get(url=item_list_api, params=query, headers=headers, timeout=5)\n",
"    payload = json.loads(resp.content.decode('utf-8'))\n",
"    resp.close()\n",
"\n",
"    # Each section carries its own id list; record all of them as keys.\n",
"    items.update(\n",
"        (str(found_id), {})\n",
"        for section in payload['sections']\n",
"        for found_id in section['item_ids']\n",
"    )\n",
"    return items"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"items = {}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"geohash에 따른 방 정보를 가져오기"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Geohash cells to search.\n",
"# NOTE(review): the name suggests they cover the Sillim area - confirm.\n",
"geos_sinrim = ['wydm0', 'wydm1', 'wydm2', 'wydm3']\n",
"\n",
"# Collect listing ids one cell at a time, pausing a second between calls.\n",
"for geo in geos_sinrim:\n",
"    items = get_room_list(items, geohash=geo)\n",
"    time.sleep(1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"len(items)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"위에서 가져온 원하는 지역의 item id를 통해 각각의 방 정보를 가져온다."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"result = describe_room_list(items)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"len(result)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pp = pprint.PrettyPrinter(indent=2)\n",
"pp.pprint(result[0]['item'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"필터링 조건을 통해 내가 원하는 방의 정보만 가져온다."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"filtered_result = export_items(result)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"len(filtered_result)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"여러 개로 나뉘어 있는 주소 정보를 concat 해준 뒤 각 item에 넣어주고, 해당 주소를 기반으로 같은 집을 묶어준다."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_full_address(item):\n",
"    \"\"\"Join address1, address2 and address3 with single spaces.\n",
"\n",
"    A falsy address2 or address3 (for example None) contributes an empty\n",
"    string, so both separating spaces are always present in the result.\n",
"    \"\"\"\n",
"    return (\n",
"        item['address1']\n",
"        + ' '\n",
"        + (item['address2'] or '')\n",
"        + ' '\n",
"        + (item['address3'] or '')\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"geo_based_dict = {}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Store the joined address on each listing and bucket listings by it.\n",
"for item in filtered_result:\n",
"    full_addr = item['address'] = get_full_address(item)\n",
"    geo_based_dict.setdefault(full_addr, []).append(item)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"first_key = next(iter(geo_based_dict))\n",
"pp.pprint(geo_based_dict[first_key][0])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Keep the first listing stored for each address, then order by 'size', largest first.\n",
"double_list = [same_addr[0] for same_addr in geo_based_dict.values()]\n",
"double_list.sort(key=lambda x: x['size'], reverse=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"원하는 조건을 가진 아이템을 csv 형식의 파일로 만든다."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Columns written to the CSV file, in output order.\n",
"fieldnames = [\n",
"    'id',\n",
"    'title',\n",
"    'description',\n",
"    'deposit',\n",
"    'agent_name',\n",
"    'agent_mobile',\n",
"    '_floor',\n",
"    'address',\n",
"    'size',\n",
"    'size_m2',\n",
"    'status',\n",
"    'room_type',\n",
"    'manage_cost_inc',\n",
"    'updated_at2',\n",
"    'url',\n",
"]\n",
"\n",
"# The encoding is pinned so the Korean text is written the same way on\n",
"# every platform instead of depending on the locale default.\n",
"with open('eggs.csv', 'w', newline='', encoding='utf-8') as csvfile:\n",
"    # extrasaction='ignore' drops keys that are not listed in fieldnames.\n",
"    writer = csv.DictWriter(csvfile, extrasaction='ignore', fieldnames=fieldnames)\n",
"    writer.writeheader()\n",
"    writer.writerows(double_list)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment