Created
January 29, 2019 21:07
-
-
Save pshapiro/a86dc340f57c38fc22d0545ddec1fc9e to your computer and use it in GitHub Desktop.
Jupyter Notebook that takes the outlinks from a Screaming Frog crawl as input, grabs PA & DA from the Moz API, and uses a WHOIS API to determine domain availability.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Expired Domain Finder" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Change the `client` variable to include your Moz API *Access ID* and *Secret Key*. You'll need access to the Moz API." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 19, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import json\n", | |
"import os\n", | |
"import time\n", | |
"from urllib.parse import urlparse\n", | |
"\n", | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"import requests\n", | |
"from mozscape import Mozscape\n", | |
"\n", | |
"def divide_chunks(l, n):\n", | |
"    \"\"\"Yield successive slices of `l`, each containing at most `n` items.\"\"\"\n", | |
"    offsets = range(0, len(l), n)\n", | |
"    for offset in offsets:\n", | |
"        yield l[offset:offset + n]\n", | |
" \n", | |
"# Prefer credentials from the environment so real keys never have to be saved in\n", | |
"# the notebook; the original placeholders remain as the fallback values.\n", | |
"client = Mozscape(os.environ.get('MOZ_ACCESS_ID', 'my_access_id'),\n", | |
"                  os.environ.get('MOZ_SECRET_KEY', 'my_secret_key'))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"The `csv` variable is loaded from an *All Outlinks* report exported from [Screaming Frog](https://www.screamingfrog.co.uk/seo-spider/)." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Load the export; skiprows=1 drops the first line of the file so the second\n", | |
"# line is read as the header row.\n", | |
"csv = pd.read_csv('./all_outlinks.csv', skiprows=1)\n", | |
"\n", | |
"# Keep only rows whose Type is AHREF, then drop destinations on excluded domains.\n", | |
"# The exclusion is applied to `links` (not `csv`) so the AHREF filter is not lost.\n", | |
"links = csv[csv['Type'] == 'AHREF']\n", | |
"excluded = links['Destination'].str.match('https?://boardgamegeek.com/.*|https?://rpggeek.com/.*|https?://boardgamegeekstore.com/.*|https?://.*.\\.geekdo-.*.com/.*|https?://videogamegeek.com/.*|https?://.*\\.amazon-.*.com.*')\n", | |
"links = links[~excluded]\n", | |
"\n", | |
"# Reduce every destination URL to its scheme + host part.\n", | |
"Domains = links['Destination'].replace(to_replace=\"(.*://)?([^/?]+).*\", value=r\"\\1\\2\", regex=True)\n", | |
"\n", | |
"# Unique hosts, split into batches of 5 for the Moz API loop.\n", | |
"x = list(divide_chunks(Domains.unique().tolist(), 5))\n", | |
"\n", | |
"df = pd.DataFrame(columns=['pda','upa','url','status'])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"`headers` is set up to spoof the Googlebot user agent so that servers are less likely to block the status code checks. The loop sleeps for 5 seconds after every batch of 5 domains checked with the Moz API." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"{'upa': 20, 'pda': 13, 'url': 'http://www.qmlogistics.com', 'status': 200}\n", | |
"{'upa': 100, 'pda': 100, 'url': 'https://www.youtube.com', 'status': 200}\n", | |
"{'upa': 37, 'pda': 73, 'url': 'https://moedaseco.lojaintegrada.com.br', 'status': 200}\n", | |
"{'upa': 22, 'pda': 19, 'url': 'https://www.eggertspiele.com', 'status': 200}\n", | |
"{'upa': 80, 'pda': 94, 'url': 'https://www.amazon.co.uk', 'status': 200}\n", | |
"{'upa': 30, 'pda': 23, 'url': 'https://boardgameprices.co.uk', 'status': 200}\n", | |
"{'upa': 22, 'pda': 22, 'url': 'http://firestormcards.co.uk', 'status': 200}\n", | |
"{'upa': 65, 'pda': 83, 'url': 'http://www.boardgamegeek.com', 'status': 200}\n", | |
"{'upa': 56, 'pda': 68, 'url': 'https://challonge.com', 'status': 403}\n", | |
"{'upa': 31, 'pda': 28, 'url': 'https://www.gamenerdz.com', 'status': 200}\n", | |
"{'upa': 40, 'pda': 36, 'url': 'https://www.thebrokentoken.com', 'status': 200}\n", | |
"{'upa': 50, 'pda': 49, 'url': 'https://www.plaidhatgames.com', 'status': 200}\n", | |
"{'upa': 1, 'pda': 0, 'url': 'http://www.moedaseco.com.br', 'status': 200}\n", | |
"{'upa': 41, 'pda': 43, 'url': 'https://www.maydaygames.com', 'status': 200}\n", | |
"{'upa': 37, 'pda': 35, 'url': 'http://www.summoner.nl', 'status': 200}\n", | |
"{'upa': 66, 'pda': 94, 'url': 'https://cdn.shopify.com', 'status': 403}\n", | |
"{'upa': 59, 'pda': 71, 'url': 'https://www.fantasyflightgames.com', 'status': 403}\n", | |
"{'upa': 59, 'pda': 92, 'url': 'https://media.giphy.com', 'status': 403}\n", | |
"{'upa': 63, 'pda': 76, 'url': 'https://memegenerator.net', 'status': 200}\n", | |
"{'upa': 32, 'pda': 30, 'url': 'https://www.planbgames.com', 'status': 200}\n", | |
"{'upa': 42, 'pda': 37, 'url': 'https://strongholdgames.com', 'status': 200}\n", | |
"{'upa': 52, 'pda': 58, 'url': 'https://www.yourlogicalfallacyis.com', 'status': 200}\n", | |
"{'upa': 36, 'pda': 33, 'url': 'http://www.bordspelmania.eu', 'status': 200}\n", | |
"{'upa': 30, 'pda': 28, 'url': 'http://bordspeler.nl', 'status': 200}\n", | |
"{'upa': 100, 'pda': 100, 'url': 'https://twitter.com', 'status': 200}\n", | |
"{'upa': 88, 'pda': 97, 'url': 'https://en.wikipedia.org', 'status': 200}\n", | |
"{'upa': 52, 'pda': 54, 'url': 'http://www.coolstuffinc.com', 'status': 200}\n", | |
"{'upa': 65, 'pda': 92, 'url': 'https://i.ytimg.com', 'status': 404}\n", | |
"{'upa': 91, 'pda': 97, 'url': 'https://www.amazon.com', 'status': 200}\n", | |
"{'upa': 71, 'pda': 91, 'url': 'https://www.amazon.ca', 'status': 200}\n", | |
"{'upa': 27, 'pda': 26, 'url': 'http://www.apttogame.com', 'status': 200}\n", | |
"{'upa': 40, 'pda': 34, 'url': 'http://www.eggertspiele.de', 'status': 200}\n", | |
"{'upa': 65, 'pda': 93, 'url': 'https://s-media-cache-ak0.pinimg.com', 'status': 403}\n", | |
"{'upa': 51, 'pda': 70, 'url': 'https://tshaonline.org', 'status': 200}\n", | |
"{'upa': 85, 'pda': 95, 'url': 'https://www.etsy.com', 'status': 200}\n", | |
"{'upa': 24, 'pda': 20, 'url': 'https://boardgameinnovation.com', 'status': 200}\n", | |
"{'upa': 37, 'pda': 36, 'url': 'http://www.boardgamebliss.com', 'status': 200}\n", | |
"{'upa': 37, 'pda': 42, 'url': 'http://frpgames.com', 'status': 200}\n", | |
"{'upa': 50, 'pda': 52, 'url': 'http://www.philibertnet.com', 'status': 200}\n", | |
"{'upa': 39, 'pda': 34, 'url': 'http://www.thirstymeeples.co.uk', 'status': 200}\n", | |
"{'upa': 54, 'pda': 57, 'url': 'http://www.artscow.com', 'status': 200}\n", | |
"{'upa': 81, 'pda': 97, 'url': 'https://itunes.apple.com', 'status': 200}\n", | |
"{'upa': 6, 'pda': 7, 'url': 'http://boardgames.bplaced.net', 'status': 200}\n", | |
"{'upa': 51, 'pda': 95, 'url': 'https://opinionatedgamers.files.wordpress.com', 'status': 200}\n", | |
"{'upa': 7, 'pda': 9, 'url': 'http://eggertspiele.bplaced.net', 'status': 403}\n", | |
"{'upa': 37, 'pda': 37, 'url': 'http://www.strongholdgames.com', 'status': 200}\n", | |
"{'upa': 62, 'pda': 93, 'url': 'https://i.pinimg.com', 'status': 403}\n", | |
"{'upa': 26, 'pda': 20, 'url': 'http://www.athenagames.com', 'status': 200}\n", | |
"{'upa': 28, 'pda': 23, 'url': 'http://boardgamesinsider.com', 'status': 200}\n", | |
"{'upa': 37, 'pda': 33, 'url': 'http://store.401games.ca', 'status': 200}\n", | |
"{'upa': 41, 'pda': 46, 'url': 'http://www.boardgamequest.com', 'status': 200}\n", | |
"{'upa': 33, 'pda': 35, 'url': 'http://brettspielbox.de', 'status': 200}\n", | |
"{'upa': 25, 'pda': 26, 'url': 'http://www.brettspiel-news.de', 'status': 200}\n", | |
"{'upa': 68, 'pda': 92, 'url': 'https://pbs.twimg.com', 'status': 400}\n", | |
"{'upa': 25, 'pda': 36, 'url': 'https://www.cpforbes.net', 'status': 403}\n", | |
"{'upa': 85, 'pda': 97, 'url': 'http://goo.gl', 'status': 200}\n", | |
"{'upa': 65, 'pda': 83, 'url': 'https://www.boardgamegeek.com', 'status': 200}\n", | |
"{'upa': 22, 'pda': 17, 'url': 'http://www.argfx.at', 'status': 200}\n", | |
"{'upa': 43, 'pda': 42, 'url': 'https://www.blend4web.com', 'status': 200}\n", | |
"{'upa': 37, 'pda': 32, 'url': 'http://www.plato-magazine.com', 'status': 200}\n", | |
"{'upa': 35, 'pda': 37, 'url': 'http://www.vindjeu.eu', 'status': 200}\n", | |
"{'upa': 52, 'pda': 54, 'url': 'https://www.coolstuffinc.com', 'status': 200}\n", | |
"{'upa': 41, 'pda': 37, 'url': 'http://www.cardhaus.com', 'status': 200}\n", | |
"{'upa': 53, 'pda': 56, 'url': 'http://www.webhallen.com', 'status': 200}\n", | |
"{'upa': 48, 'pda': 53, 'url': 'http://www.cowcow.com', 'status': 200}\n", | |
"{'upa': 67, 'pda': 80, 'url': 'https://www.rotary.org', 'status': 200}\n", | |
"{'upa': 15, 'pda': 12, 'url': 'http://controlledareagaming.com', 'status': 200}\n", | |
"{'upa': 75, 'pda': 92, 'url': 'https://www.twitch.tv', 'status': 200}\n", | |
"{'upa': 78, 'pda': 93, 'url': 'https://www.amazon.de', 'status': 200}\n", | |
"{'upa': 68, 'pda': 83, 'url': 'http://www.thingiverse.com', 'status': 200}\n", | |
"{'upa': 23, 'pda': 20, 'url': 'http://www.boardgameinnovation.com', 'status': 200}\n", | |
"{'upa': 67, 'pda': 95, 'url': 'https://m.imgur.com', 'status': 200}\n", | |
"{'upa': 82, 'pda': 96, 'url': 'https://play.google.com', 'status': 200}\n", | |
"{'upa': 1, 'pda': 0, 'url': 'http://concordiascore.azurewebsites.net', 'status': -1}\n", | |
"{'upa': 48, 'pda': 95, 'url': 'https://thevirginiantv.files.wordpress.com', 'status': 200}\n", | |
"{'upa': 51, 'pda': 50, 'url': 'http://www.miniaturemarket.com', 'status': 200}\n", | |
"{'upa': 31, 'pda': 28, 'url': 'http://www.greatboardgames.ca', 'status': 200}\n", | |
"{'upa': 89, 'pda': 98, 'url': 'https://www.reddit.com', 'status': 429}\n", | |
"{'upa': 51, 'pda': 52, 'url': 'http://www.pegasus.de', 'status': 200}\n", | |
"{'upa': 30, 'pda': 25, 'url': 'https://www.topshelfgamer.com', 'status': 200}\n", | |
"{'upa': 19, 'pda': 16, 'url': 'http://fatcatgaming.co.uk', 'status': 200}\n", | |
"{'upa': 40, 'pda': 36, 'url': 'http://www.thebrokentoken.com', 'status': 200}\n", | |
"{'upa': 41, 'pda': 38, 'url': 'http://www.meeplesource.com', 'status': 200}\n", | |
"{'upa': 79, 'pda': 93, 'url': 'https://www.kickstarter.com', 'status': 200}\n", | |
"{'upa': 43, 'pda': 39, 'url': 'http://www.eaglegames.net', 'status': 200}\n", | |
"{'upa': 81, 'pda': 96, 'url': 'https://youtu.be', 'status': 200}\n", | |
"{'upa': 37, 'pda': 36, 'url': 'https://www.boardgamebliss.com', 'status': 200}\n", | |
"{'upa': 45, 'pda': 44, 'url': 'http://1856.com', 'status': 200}\n", | |
"{'upa': 23, 'pda': 18, 'url': 'http://www.unhalfbricking.com', 'status': -1}\n", | |
"{'upa': 27, 'pda': 18, 'url': 'http://www.boardgamesearch.com.au', 'status': 200}\n", | |
"{'upa': 46, 'pda': 74, 'url': 'https://m.media-amazon.com', 'status': 400}\n", | |
"{'upa': 58, 'pda': 91, 'url': 'https://images-na.ssl-images-amazon.com', 'status': 400}\n", | |
"{'upa': 30, 'pda': 34, 'url': 'http://eggertspiele.de', 'status': 200}\n", | |
"{'upa': 82, 'pda': 95, 'url': 'https://imgur.com', 'status': 200}\n" | |
] | |
} | |
], | |
"source": [ | |
"headers = {'user-agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'}\n", | |
"\n", | |
"# Collect one record per URL and build the DataFrame once at the end:\n", | |
"# DataFrame.append was removed in pandas 2.0 and copied the frame on every call.\n", | |
"records = []\n", | |
"\n", | |
"for vals in x:\n", | |
"    da_pa = client.urlMetrics(vals, Mozscape.UMCols.domainAuthority | Mozscape.UMCols.pageAuthority)\n", | |
"    # Metrics are paired with the submitted URLs by position.\n", | |
"    for url, y in zip(vals, da_pa):\n", | |
"        y['url'] = url\n", | |
"        try:\n", | |
"            # The timeout keeps one unresponsive server from stalling the whole run.\n", | |
"            r = requests.get(url, headers=headers, timeout=10)\n", | |
"            y['status'] = r.status_code\n", | |
"        except requests.exceptions.RequestException:\n", | |
"            # -1 marks a URL that produced no HTTP response at all (connection\n", | |
"            # error, timeout, redirect loop, invalid URL, ...).\n", | |
"            y['status'] = -1\n", | |
"        records.append(y)\n", | |
"        print(y)\n", | |
"    # Pause between batches of Moz API calls.\n", | |
"    time.sleep(5)\n", | |
"\n", | |
"df = pd.DataFrame(records, columns=['pda','upa','url','status'])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Keep every domain whose status code is at or above `status_code_threshold` (400 by default) and whose Domain Authority is at or above `da_threshold` (25 by default)." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"status_code_threshold = 400\n", | |
"da_threshold = 25\n", | |
"\n", | |
"# A status of -1 is the marker stored by the crawl loop when the request itself\n", | |
"# failed (no HTTP response). Those hosts are the strongest expired-domain\n", | |
"# candidates, so keep them instead of letting the `>= 400` test drop them.\n", | |
"is_error = (df['status'] >= status_code_threshold) | (df['status'] == -1)\n", | |
"has_authority = df['pda'] >= da_threshold\n", | |
"error_urls = df[is_error & has_authority]['url'].tolist()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Set the `whois_api_key` variable to your [Whois XML API](https://main.whoisxmlapi.com/) key. 500 credits are available for free." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"amazon.co.uk status: UNAVAILABLE\n", | |
"shopify.com status: UNAVAILABLE\n", | |
"giphy.com status: UNAVAILABLE\n", | |
"coolstuffinc.com status: UNAVAILABLE\n", | |
"ytimg.com status: UNAVAILABLE\n", | |
"amazon.com status: UNAVAILABLE\n", | |
"pinimg.com status: AVAILABLE\n", | |
"pinimg.com status: AVAILABLE\n", | |
"twimg.com status: UNAVAILABLE\n", | |
"coolstuffinc.com status: UNAVAILABLE\n", | |
"challonge.com status: AVAILABLE\n", | |
"shopify.com status: UNAVAILABLE\n", | |
"fantasyflightgames.com status: AVAILABLE\n", | |
"giphy.com status: UNAVAILABLE\n", | |
"ytimg.com status: UNAVAILABLE\n", | |
"pinimg.com status: AVAILABLE\n", | |
"pinimg.com status: UNAVAILABLE\n", | |
"twimg.com status: UNAVAILABLE\n", | |
"cpforbes.net status: UNAVAILABLE\n", | |
"reddit.com status: UNAVAILABLE\n", | |
"media-amazon.com status: UNAVAILABLE\n", | |
"ssl-images-amazon.com status: UNAVAILABLE\n" | |
] | |
} | |
], | |
"source": [ | |
"# Prefer the key from the environment; the original placeholder is the fallback.\n", | |
"whois_api_key = os.environ.get('WHOIS_API_KEY', \"your_key\")\n", | |
"whois_endpoint = \"https://www.whoisxmlapi.com/whoisserver/WhoisService\"\n", | |
"\n", | |
"# Look up each host only once: error_urls can contain the same host under both\n", | |
"# http and https, and every lookup is a separate API request.\n", | |
"hosts = []\n", | |
"for url in error_urls:\n", | |
"    # Fall back to the raw value when it has no scheme for urlparse to split on.\n", | |
"    host = urlparse(url).hostname or url\n", | |
"    if host not in hosts:\n", | |
"        hosts.append(host)\n", | |
"\n", | |
"# The loop variable is deliberately not `x`, which holds the URL batches used by\n", | |
"# the crawl cell and would otherwise be overwritten here.\n", | |
"for host in hosts:\n", | |
"    # Passing the query through `params` lets requests URL-encode every value.\n", | |
"    r = requests.get(whois_endpoint, params={\n", | |
"        'apiKey': whois_api_key,\n", | |
"        'outputFormat': 'JSON',\n", | |
"        'cmd': 'GET_DN_AVAILABILITY',\n", | |
"        'domainName': host,\n", | |
"    }, timeout=30)\n", | |
"    parsed_json = json.loads(r.text)\n", | |
"    domain_info = parsed_json.get('DomainInfo')\n", | |
"    if domain_info is None:\n", | |
"        # No DomainInfo in the reply (presumably an API error such as a bad key -\n", | |
"        # TODO confirm); report it and continue with the remaining hosts.\n", | |
"        print(host + \" lookup failed: \" + r.text)\n", | |
"        continue\n", | |
"    print(domain_info['domainName'] + \" status: \" + domain_info['domainAvailability'])" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment