Skip to content

Instantly share code, notes, and snippets.

@converge
Last active January 3, 2019 16:48
Show Gist options
  • Save converge/a07a700cf4761299f403b73e88317136 to your computer and use it in GitHub Desktop.
Save converge/a07a700cf4761299f403b73e88317136 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import re
import requests
import pandas as pd
from tabulate import tabulate
import os
import csv
# --- Step 1: scrape the public account list and write name/link pairs to ops.csv ---

BASE_URL = 'https://www.kompetenznetz-mittelstand.de/de/app/account/list/public?s=IwvKhoY3u236pAafN&action=list&page='

# Listing pages to fetch. Only page 1 is requested here; the original author
# kept range(1, 146) as a commented-out alternative for a larger crawl.
pages = [BASE_URL + str(i) + '&item_count=200' for i in range(1, 2)]

# Class selectors whose first matching element is stripped from each page
# before links are collected. The order is significant: the bare 'left'
# selector is applied before the more specific button-group selectors.
# NOTE(review): presumably these are navigation/button areas whose anchors are
# not company entries -- confirm against the live page markup.
_UNWANTED_CLASSES = (
    'sf-nav sf-nav-tabmenu sf-hide-print',
    'left',
    'sf-btn-group left',
    'sf-btn-group',
)

# newline='' is what the csv module expects for files it writes (otherwise
# blank rows appear on Windows). The with-block guarantees the file is flushed
# and closed before any later code reads ops.csv back.
with open('ops.csv', 'w', encoding='utf-8', newline='') as ops_file:
    f = csv.writer(ops_file)
    f.writerow(['Name', 'Link'])

    for item in pages:
        # A timeout keeps a stalled connection from hanging the script forever;
        # raise_for_status() stops us from parsing an HTTP error page.
        page = requests.get(item, timeout=30)
        page.raise_for_status()
        soup = BeautifulSoup(page.text, 'html.parser')

        # Drop the first <img> of the document, if there is one.
        first_image = soup.img
        if first_image is not None:
            first_image.decompose()

        # Remove the first element matching each unwanted selector. A selector
        # that matches nothing is skipped instead of raising AttributeError.
        for css_class in _UNWANTED_CLASSES:
            unwanted = soup.find(class_=css_class)
            if unwanted is not None:
                unwanted.decompose()

        company_list = soup.find(class_='sfsDialogContent')
        if company_list is None:
            # Without the list container there is nothing to extract; fail
            # loudly with the URL rather than with an opaque AttributeError.
            raise RuntimeError(
                "No element with class 'sfsDialogContent' found at " + item
            )

        for company_name in company_list.find_all('a'):
            href = company_name.get('href')
            if not company_name.contents or href is None:
                # Anchor without content or without a target: not a usable row.
                continue
            # The first child may be a tag (e.g. an <img>) rather than text;
            # it is written as-is so such rows can be filtered out afterwards.
            names = company_name.contents[0]
            links = 'https://www.kompetenznetz-mittelstand.de' + href
            f.writerow([names, links])
# --- Step 2: copy ops.csv to correct.csv, dropping rows whose name is an <img> tag ---
#
# The scraping step stores the first child of every anchor as the name, so
# anchors that wrap an image produce a name cell starting with '<img alt'.
# Those rows are not company names and are left out of correct.csv.
#
# Both files are opened as UTF-8 (matching how ops.csv is written) with
# newline='' as the csv module expects, and are closed by the with-block.
with open('ops.csv', encoding='utf-8', newline='') as file, \
        open('correct.csv', 'w', encoding='utf-8', newline='') as correct_file:
    file_reader = csv.reader(file)
    writer = csv.writer(correct_file)
    for row in file_reader:
        # csv.reader yields an empty list for a blank line; indexing it would
        # raise IndexError, so skip such rows.
        if not row:
            continue
        if not row[0].startswith('<img alt'):
            writer.writerow(row)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment