Created
January 28, 2019 22:08
-
-
Save voltek62/1f3eb995d443f1d515835797af821bd1 to your computer and use it in GitHub Desktop.
Revisions
-
pshapiro created this gist
Jan 21, 2019 .There are no files selected for viewing
# %% [markdown]
# # Broken outbound links on authoritative domains
#
# Reads a crawler export of outbound links, looks up Moz Domain Authority and
# Page Authority for every linked origin (scheme + host), records the HTTP
# status each origin answers with, and asks the WhoisXML API whether the hosts
# behind failing, high-authority links are available for registration.
#
# Credentials are read from the environment, never from the notebook:
# `MOZ_ACCESS_ID`, `MOZ_SECRET_KEY`, `WHOISXML_API_KEY`.

# %%
import json
import os
import time
from urllib.parse import urlsplit

import numpy as np
import pandas as pd

# `requests` and `mozscape` are imported inside the functions that actually hit
# the network, so the pure helpers below stay importable (and testable) on a
# machine that has neither the packages nor network access.

# %% [markdown]
# ## Configuration

# %%
OUTLINKS_CSV = "./all_outlinks.csv"  # crawler export; its first row is skipped on load
LINK_TYPE = "AHREF"                  # only anchor links are analysed

# Destinations matching this pattern are treated as "our own" and ignored.
# NOTE(review): kept exactly as found. The dots are unescaped and, because
# `str.match` anchors at the start, hosts with a `www.` prefix are NOT matched
# by the first alternatives -- confirm whether that is intended.
INTERNAL_URL_PATTERN = (
    r"https?://boardgamegeek.com/.*"
    r"|https?://rpggeek.com/.*"
    r"|https?://boardgamegeekstore.com/.*"
    r"|https?://.*.\.geekdo-.*.com/.*"
    r"|https?://videogamegeek.com/.*"
    r"|https?://.*\.amazon-.*.com.*"
)

BATCH_SIZE = 5                # URLs per Mozscape request
BATCH_PAUSE_SECONDS = 5       # pause between Mozscape requests
REQUEST_TIMEOUT_SECONDS = 10  # upper bound for every HTTP call
UNREACHABLE_STATUS = -1       # sentinel stored when no HTTP response was obtained
STATUS_CODE_THRESHOLD = 400   # statuses at or above this count as failing
DA_THRESHOLD = 25             # minimum 'pda' value worth a WHOIS lookup
METRIC_COLUMNS = ["pda", "upa", "url", "status"]
WHOIS_ENDPOINT = "https://www.whoisxmlapi.com/whoisserver/WhoisService"

# Optional scheme (restricted to scheme characters so that a URL embedded in a
# query string cannot be mistaken for the scheme) followed by the host part.
_ORIGIN_PATTERN = r"^([A-Za-z][A-Za-z0-9+.\-]*://)?([^/?#]+).*"

# %% [markdown]
# ## Helpers

# %%
def divide_chunks(l, n):
    """Yield consecutive slices of ``l`` that hold at most ``n`` items each.

    Raises:
        ValueError: if ``n`` is smaller than 1 (raised on first iteration).
    """
    if n < 1:
        raise ValueError("chunk size n must be a positive integer")
    for start in range(0, len(l), n):
        yield l[start:start + n]


def select_external_links(outlinks, link_type=LINK_TYPE,
                          internal_pattern=INTERNAL_URL_PATTERN):
    """Return the rows of ``outlinks`` that are external links of ``link_type``.

    A row is kept when its ``Type`` equals ``link_type``, its ``Destination``
    is present, and the destination does not match ``internal_pattern``.
    All three conditions are applied to the same frame; the previous version
    re-filtered the raw frame and thereby dropped the type filter.
    """
    is_wanted_type = outlinks["Type"] == link_type
    has_destination = outlinks["Destination"].notna()
    # na=False: a missing destination is simply "not internal" instead of NaN,
    # which could not be negated.
    is_internal = outlinks["Destination"].str.match(internal_pattern, na=False)
    return outlinks[is_wanted_type & has_destination & ~is_internal]


def extract_origins(destinations):
    """Reduce destination URLs to unique ``scheme://host`` strings.

    Order of first appearance is preserved. Missing values and destinations
    that are not http(s) URLs are dropped, since they cannot be fetched later.
    """
    origins = (
        destinations.dropna()
        .astype(str)
        .replace(to_replace=_ORIGIN_PATTERN, value=r"\1\2", regex=True)
    )
    is_http = origins.str.match(r"https?://", case=False)
    return origins[is_http].unique().tolist()


def probe_status(url, timeout=REQUEST_TIMEOUT_SECONDS):
    """GET ``url`` and return its HTTP status code.

    Returns ``UNREACHABLE_STATUS`` when no response could be obtained at all
    (DNS failure, refused connection, timeout, too many redirects, ...).
    """
    import requests  # network-only dependency, see the imports cell

    try:
        return requests.get(url, timeout=timeout).status_code
    except requests.exceptions.RequestException:
        return UNREACHABLE_STATUS


def collect_metrics(url_batches, client, cols=None, status_of=probe_status,
                    pause=BATCH_PAUSE_SECONDS, verbose=False):
    """Build one table with Moz metrics and HTTP status per URL.

    Args:
        url_batches: list of URL lists; each inner list is sent to
            ``client.urlMetrics`` in a single call.
        client: object exposing ``urlMetrics(urls, cols)`` that returns one
            dict per URL, in the same order as ``urls``.
        cols: column flags handed to ``urlMetrics``. Defaults to Mozscape's
            domain-authority and page-authority flags.
        status_of: callable mapping a URL to its HTTP status.
        pause: seconds to wait between two ``urlMetrics`` calls.
        verbose: print every record as soon as it is complete.

    Returns:
        DataFrame with the columns listed in ``METRIC_COLUMNS``.
    """
    if cols is None:
        from mozscape import Mozscape  # network-only dependency

        cols = Mozscape.UMCols.domainAuthority | Mozscape.UMCols.pageAuthority

    records = []
    final_index = len(url_batches) - 1
    for index, batch in enumerate(url_batches):
        metrics = client.urlMetrics(batch, cols)
        for url, row in zip(batch, metrics):
            record = dict(row, url=url, status=status_of(url))
            records.append(record)
            if verbose:
                print(record)
        if index < final_index:  # nothing left to throttle after the last call
            time.sleep(pause)
    # One construction at the end instead of growing a frame row by row.
    return pd.DataFrame(records, columns=METRIC_COLUMNS)


def select_candidates(metrics, status_threshold=STATUS_CODE_THRESHOLD,
                      da_threshold=DA_THRESHOLD, include_unreachable=True):
    """Return the URLs worth a WHOIS lookup.

    A URL qualifies when its ``pda`` is at least ``da_threshold`` and it either
    answered with a status of at least ``status_threshold`` or, when
    ``include_unreachable`` is true, could not be reached at all. Unreachable
    hosts are included because a host that still answers with an HTTP error is
    necessarily served from a registered domain.
    """
    failing = metrics["status"] >= status_threshold
    if include_unreachable:
        failing = failing | (metrics["status"] == UNREACHABLE_STATUS)
    authoritative = metrics["pda"] >= da_threshold
    return metrics.loc[failing & authoritative, "url"].tolist()


def unique_hosts(urls):
    """Return the host names of ``urls`` without duplicates, first seen first.

    ``http://`` and ``https://`` variants of one host collapse into a single
    entry so the same host is not looked up twice.
    """
    hosts = []
    seen = set()
    for url in urls:
        host = urlsplit(url).hostname or url
        if host not in seen:
            seen.add(host)
            hosts.append(host)
    return hosts


def parse_availability(payload):
    """Extract ``(domain name, availability)`` from a decoded WHOIS response.

    Raises:
        RuntimeError: if the payload lacks the expected ``DomainInfo`` fields
            (for example when the service answered with an error document).
    """
    try:
        info = payload["DomainInfo"]
        return info["domainName"], info["domainAvailability"]
    except (KeyError, TypeError):
        raise RuntimeError("unexpected WHOIS response: %r" % (payload,))


def check_availability(host, api_key, timeout=REQUEST_TIMEOUT_SECONDS):
    """Ask the WhoisXML API whether the domain behind ``host`` is available.

    Returns the ``(domain name, availability)`` pair reported by the service.
    """
    import requests  # network-only dependency

    response = requests.get(
        WHOIS_ENDPOINT,
        # Passing params lets requests URL-encode every value.
        params={
            "apiKey": api_key,
            "outputFormat": "JSON",
            "cmd": "GET_DN_AVAILABILITY",
            "domainName": host,
        },
        timeout=timeout,
    )
    response.raise_for_status()
    return parse_availability(json.loads(response.text))


def require_env(name):
    """Return environment variable ``name`` or fail with a clear message."""
    value = os.environ.get(name)
    if not value:
        raise RuntimeError("environment variable %s is not set" % name)
    return value

# %% [markdown]
# ## Run
#
# The guard is true inside a notebook kernel and when the file is executed as
# a script, so every intermediate result below stays inspectable afterwards.

# %%
if __name__ == "__main__":
    from mozscape import Mozscape

    client = Mozscape(require_env("MOZ_ACCESS_ID"), require_env("MOZ_SECRET_KEY"))
    whois_api_key = require_env("WHOISXML_API_KEY")

    # Load and narrow down the crawl export.
    outlinks_raw = pd.read_csv(OUTLINKS_CSV, skiprows=1)
    external_links = select_external_links(outlinks_raw)
    origins = extract_origins(external_links["Destination"])
    origin_batches = list(divide_chunks(origins, BATCH_SIZE))

    # Moz metrics + HTTP status for every origin.
    metrics = collect_metrics(origin_batches, client, verbose=True)

    # WHOIS availability for failing, authoritative origins.
    candidate_urls = select_candidates(metrics)
    availability_rows = []
    for host in unique_hosts(candidate_urls):
        domain_name, domain_availability = check_availability(host, whois_api_key)
        availability_rows.append(
            {"domain": domain_name, "availability": domain_availability}
        )
        print(domain_name + " status: " + domain_availability)
    availability = pd.DataFrame(availability_rows, columns=["domain", "availability"])