Skip to content

Instantly share code, notes, and snippets.

@voltek62
Created January 28, 2019 22:08
Show Gist options
  • Select an option

  • Save voltek62/1f3eb995d443f1d515835797af821bd1 to your computer and use it in GitHub Desktop.

Select an option

Save voltek62/1f3eb995d443f1d515835797af821bd1 to your computer and use it in GitHub Desktop.

Revisions

  1. @pshapiro pshapiro created this gist Jan 21, 2019.
    240 changes: 240 additions & 0 deletions expired_domain_finder.ipynb
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,240 @@
    {
    "cells": [
    {
    "cell_type": "code",
    "execution_count": 148,
    "metadata": {
    "collapsed": true
    },
    "outputs": [],
    "source": [
    "from mozscape import Mozscape\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import requests\n",
    "import json\n",
    "import time\n",
    "\n",
    "def divide_chunks(l, n): \n",
    " for i in range(0, len(l), n): \n",
    " yield l[i:i + n] \n",
    "\n",
    "client = Mozscape('my_access_id', 'my_secret_key')"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 149,
    "metadata": {
    "collapsed": true
    },
    "outputs": [],
    "source": [
    "csv = pd.read_csv('./all_outlinks.csv', skiprows=1)\n",
    "\n",
    "links = csv[csv['Type'] == 'AHREF']\n",
    "links = csv[~csv['Destination'].str.match('https?://boardgamegeek.com/.*|https?://rpggeek.com/.*|https?://boardgamegeekstore.com/.*|https?://.*.\\.geekdo-.*.com/.*|https?://videogamegeek.com/.*|https?://.*\\.amazon-.*.com.*')]\n",
    "\n",
    "Domains = links['Destination'].replace(to_replace=\"(.*://)?([^/?]+).*\", value=r\"\\1\\2\", regex=True)\n",
    "\n",
    "x = list(divide_chunks(Domains.unique().tolist(), 5)) \n",
    "\n",
    "df = pd.DataFrame(columns=['pda','upa','url','status'])"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 150,
    "metadata": {},
    "outputs": [
    {
    "name": "stdout",
    "output_type": "stream",
    "text": [
    "{'upa': 20, 'pda': 13, 'url': 'http://www.qmlogistics.com', 'status': 403}\n",
    "{'upa': 100, 'pda': 100, 'url': 'https://www.youtube.com', 'status': 200}\n",
    "{'upa': 37, 'pda': 73, 'url': 'https://moedaseco.lojaintegrada.com.br', 'status': 200}\n",
    "{'upa': 22, 'pda': 19, 'url': 'https://www.eggertspiele.com', 'status': 503}\n",
    "{'upa': 80, 'pda': 94, 'url': 'https://www.amazon.co.uk', 'status': 503}\n",
    "{'upa': 29, 'pda': 23, 'url': 'https://boardgameprices.co.uk', 'status': 200}\n",
    "{'upa': 22, 'pda': 22, 'url': 'http://firestormcards.co.uk', 'status': 200}\n",
    "{'upa': 65, 'pda': 83, 'url': 'http://www.boardgamegeek.com', 'status': 200}\n",
    "{'upa': 56, 'pda': 68, 'url': 'https://challonge.com', 'status': 200}\n",
    "{'upa': 30, 'pda': 28, 'url': 'https://www.gamenerdz.com', 'status': -1}\n",
    "{'upa': 40, 'pda': 36, 'url': 'https://www.thebrokentoken.com', 'status': 200}\n",
    "{'upa': 50, 'pda': 49, 'url': 'https://www.plaidhatgames.com', 'status': 200}\n",
    "{'upa': 1, 'pda': 0, 'url': 'http://www.moedaseco.com.br', 'status': 200}\n",
    "{'upa': 41, 'pda': 42, 'url': 'https://www.maydaygames.com', 'status': 200}\n",
    "{'upa': 37, 'pda': 35, 'url': 'http://www.summoner.nl', 'status': 200}\n",
    "{'upa': 66, 'pda': 94, 'url': 'https://cdn.shopify.com', 'status': 403}\n",
    "{'upa': 59, 'pda': 71, 'url': 'https://www.fantasyflightgames.com', 'status': 200}\n",
    "{'upa': 59, 'pda': 92, 'url': 'https://media.giphy.com', 'status': 403}\n",
    "{'upa': 63, 'pda': 76, 'url': 'https://memegenerator.net', 'status': 200}\n",
    "{'upa': 32, 'pda': 30, 'url': 'https://www.planbgames.com', 'status': 200}\n",
    "{'upa': 42, 'pda': 37, 'url': 'https://strongholdgames.com', 'status': 200}\n",
    "{'upa': 52, 'pda': 58, 'url': 'https://www.yourlogicalfallacyis.com', 'status': 200}\n",
    "{'upa': 36, 'pda': 33, 'url': 'http://www.bordspelmania.eu', 'status': 200}\n",
    "{'upa': 29, 'pda': 28, 'url': 'http://bordspeler.nl', 'status': 200}\n",
    "{'upa': 100, 'pda': 100, 'url': 'https://twitter.com', 'status': 200}\n",
    "{'upa': 88, 'pda': 97, 'url': 'https://en.wikipedia.org', 'status': 200}\n",
    "{'upa': 52, 'pda': 54, 'url': 'http://www.coolstuffinc.com', 'status': 403}\n",
    "{'upa': 65, 'pda': 92, 'url': 'https://i.ytimg.com', 'status': 404}\n",
    "{'upa': 91, 'pda': 97, 'url': 'https://www.amazon.com', 'status': 503}\n",
    "{'upa': 71, 'pda': 91, 'url': 'https://www.amazon.ca', 'status': 200}\n",
    "{'upa': 27, 'pda': 26, 'url': 'http://www.apttogame.com', 'status': 200}\n",
    "{'upa': 40, 'pda': 34, 'url': 'http://www.eggertspiele.de', 'status': 200}\n",
    "{'upa': 65, 'pda': 93, 'url': 'https://s-media-cache-ak0.pinimg.com', 'status': 403}\n",
    "{'upa': 51, 'pda': 70, 'url': 'https://tshaonline.org', 'status': 200}\n",
    "{'upa': 85, 'pda': 95, 'url': 'https://www.etsy.com', 'status': 200}\n",
    "{'upa': 24, 'pda': 19, 'url': 'https://boardgameinnovation.com', 'status': 200}\n",
    "{'upa': 37, 'pda': 36, 'url': 'http://www.boardgamebliss.com', 'status': 200}\n",
    "{'upa': 37, 'pda': 42, 'url': 'http://frpgames.com', 'status': 200}\n",
    "{'upa': 49, 'pda': 52, 'url': 'http://www.philibertnet.com', 'status': 200}\n",
    "{'upa': 39, 'pda': 34, 'url': 'http://www.thirstymeeples.co.uk', 'status': 200}\n",
    "{'upa': 53, 'pda': 57, 'url': 'http://www.artscow.com', 'status': 200}\n",
    "{'upa': 81, 'pda': 97, 'url': 'https://itunes.apple.com', 'status': 200}\n",
    "{'upa': 6, 'pda': 7, 'url': 'http://boardgames.bplaced.net', 'status': 200}\n",
    "{'upa': 51, 'pda': 95, 'url': 'https://opinionatedgamers.files.wordpress.com', 'status': 200}\n",
    "{'upa': 7, 'pda': 9, 'url': 'http://eggertspiele.bplaced.net', 'status': 403}\n",
    "{'upa': 37, 'pda': 37, 'url': 'http://www.strongholdgames.com', 'status': 200}\n",
    "{'upa': 62, 'pda': 93, 'url': 'https://i.pinimg.com', 'status': 403}\n",
    "{'upa': 26, 'pda': 20, 'url': 'http://www.athenagames.com', 'status': 200}\n",
    "{'upa': 28, 'pda': 23, 'url': 'http://boardgamesinsider.com', 'status': 200}\n",
    "{'upa': 36, 'pda': 33, 'url': 'http://store.401games.ca', 'status': 200}\n",
    "{'upa': 41, 'pda': 46, 'url': 'http://www.boardgamequest.com', 'status': 200}\n",
    "{'upa': 32, 'pda': 34, 'url': 'http://brettspielbox.de', 'status': 200}\n",
    "{'upa': 25, 'pda': 25, 'url': 'http://www.brettspiel-news.de', 'status': 200}\n",
    "{'upa': 68, 'pda': 92, 'url': 'https://pbs.twimg.com', 'status': 400}\n",
    "{'upa': 25, 'pda': 36, 'url': 'https://www.cpforbes.net', 'status': 200}\n",
    "{'upa': 85, 'pda': 97, 'url': 'http://goo.gl', 'status': 200}\n",
    "{'upa': 65, 'pda': 83, 'url': 'https://www.boardgamegeek.com', 'status': 200}\n",
    "{'upa': 22, 'pda': 17, 'url': 'http://www.argfx.at', 'status': 200}\n",
    "{'upa': 43, 'pda': 42, 'url': 'https://www.blend4web.com', 'status': 200}\n",
    "{'upa': 37, 'pda': 32, 'url': 'http://www.plato-magazine.com', 'status': 200}\n",
    "{'upa': 35, 'pda': 37, 'url': 'http://www.vindjeu.eu', 'status': 200}\n",
    "{'upa': 52, 'pda': 54, 'url': 'https://www.coolstuffinc.com', 'status': 403}\n",
    "{'upa': 41, 'pda': 37, 'url': 'http://www.cardhaus.com', 'status': 200}\n",
    "{'upa': 53, 'pda': 56, 'url': 'http://www.webhallen.com', 'status': 200}\n",
    "{'upa': 48, 'pda': 53, 'url': 'http://www.cowcow.com', 'status': 200}\n",
    "{'upa': 67, 'pda': 80, 'url': 'https://www.rotary.org', 'status': 200}\n",
    "{'upa': 15, 'pda': 12, 'url': 'http://controlledareagaming.com', 'status': 200}\n",
    "{'upa': 75, 'pda': 92, 'url': 'https://www.twitch.tv', 'status': 200}\n",
    "{'upa': 78, 'pda': 93, 'url': 'https://www.amazon.de', 'status': 503}\n",
    "{'upa': 68, 'pda': 83, 'url': 'http://www.thingiverse.com', 'status': 200}\n",
    "{'upa': 23, 'pda': 19, 'url': 'http://www.boardgameinnovation.com', 'status': 200}\n",
    "{'upa': 67, 'pda': 95, 'url': 'https://m.imgur.com', 'status': 200}\n",
    "{'upa': 82, 'pda': 96, 'url': 'https://play.google.com', 'status': 200}\n",
    "{'upa': 1, 'pda': 0, 'url': 'http://concordiascore.azurewebsites.net', 'status': -1}\n",
    "{'upa': 48, 'pda': 95, 'url': 'https://thevirginiantv.files.wordpress.com', 'status': 200}\n",
    "{'upa': 51, 'pda': 50, 'url': 'http://www.miniaturemarket.com', 'status': 200}\n",
    "{'upa': 31, 'pda': 28, 'url': 'http://www.greatboardgames.ca', 'status': 200}\n",
    "{'upa': 89, 'pda': 98, 'url': 'https://www.reddit.com', 'status': 429}\n",
    "{'upa': 51, 'pda': 52, 'url': 'http://www.pegasus.de', 'status': 200}\n",
    "{'upa': 30, 'pda': 25, 'url': 'https://www.topshelfgamer.com', 'status': 200}\n",
    "{'upa': 19, 'pda': 16, 'url': 'http://fatcatgaming.co.uk', 'status': 200}\n",
    "{'upa': 40, 'pda': 36, 'url': 'http://www.thebrokentoken.com', 'status': 200}\n",
    "{'upa': 41, 'pda': 38, 'url': 'http://www.meeplesource.com', 'status': 200}\n",
    "{'upa': 79, 'pda': 93, 'url': 'https://www.kickstarter.com', 'status': 200}\n",
    "{'upa': 43, 'pda': 39, 'url': 'http://www.eaglegames.net', 'status': 200}\n",
    "{'upa': 81, 'pda': 96, 'url': 'https://youtu.be', 'status': 200}\n",
    "{'upa': 37, 'pda': 36, 'url': 'https://www.boardgamebliss.com', 'status': 200}\n",
    "{'upa': 45, 'pda': 44, 'url': 'http://1856.com', 'status': 403}\n",
    "{'upa': 23, 'pda': 18, 'url': 'http://www.unhalfbricking.com', 'status': 200}\n",
    "{'upa': 27, 'pda': 18, 'url': 'http://www.boardgamesearch.com.au', 'status': 200}\n",
    "{'upa': 46, 'pda': 74, 'url': 'https://m.media-amazon.com', 'status': 400}\n",
    "{'upa': 58, 'pda': 91, 'url': 'https://images-na.ssl-images-amazon.com', 'status': 400}\n",
    "{'upa': 30, 'pda': 34, 'url': 'http://eggertspiele.de', 'status': 200}\n",
    "{'upa': 82, 'pda': 95, 'url': 'https://imgur.com', 'status': 200}\n"
    ]
    }
    ],
    "source": [
    "for vals in x:\n",
    " da_pa = client.urlMetrics(vals, Mozscape.UMCols.domainAuthority | Mozscape.UMCols.pageAuthority)\n",
    " i = 0\n",
    " for y in da_pa:\n",
    " y['url'] = vals[i]\n",
    " try:\n",
    " r = requests.get(vals[i])\n",
    " y['status'] = r.status_code\n",
    " except requests.exceptions.ConnectionError:\n",
    " y['status'] = -1\n",
    " i = i+1\n",
    " df = df.append(y, ignore_index=True)\n",
    " print(y) \n",
    " time.sleep(5)"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 151,
    "metadata": {
    "collapsed": true
    },
    "outputs": [],
    "source": [
    "status_code_threshold = 400\n",
    "da_threshold = 25\n",
    "error_urls = df[(df['status'] >= status_code_threshold) & (df['pda'] >= da_threshold)]['url'].tolist()"
    ]
    },
    {
    "cell_type": "code",
    "execution_count": 152,
    "metadata": {},
    "outputs": [
    {
    "name": "stdout",
    "output_type": "stream",
    "text": [
    "amazon.co.uk status: UNAVAILABLE\n",
    "shopify.com status: UNAVAILABLE\n",
    "giphy.com status: UNAVAILABLE\n",
    "coolstuffinc.com status: UNAVAILABLE\n",
    "ytimg.com status: UNAVAILABLE\n",
    "amazon.com status: UNAVAILABLE\n",
    "pinimg.com status: UNAVAILABLE\n",
    "pinimg.com status: UNAVAILABLE\n",
    "twimg.com status: UNAVAILABLE\n",
    "coolstuffinc.com status: UNAVAILABLE\n",
    "amazon.de status: UNAVAILABLE\n",
    "reddit.com status: UNAVAILABLE\n",
    "1856.com status: UNAVAILABLE\n",
    "media-amazon.com status: UNAVAILABLE\n",
    "ssl-images-amazon.com status: UNAVAILABLE\n"
    ]
    }
    ],
    "source": [
    "whois_api_key = \"foo\"\n",
    "\n",
    "for x in error_urls:\n",
    " dnsapi = \"https://www.whoisxmlapi.com/whoisserver/WhoisService?apiKey=\" + whois_api_key + \"&outputFormat=JSON&cmd=GET_DN_AVAILABILITY&domainName=\" + x\n",
    " r = requests.get(dnsapi) \n",
    " parsed_json = json.loads(r.text)\n",
    " print(parsed_json['DomainInfo']['domainName'] + \" status: \" + parsed_json['DomainInfo']['domainAvailability'])"
    ]
    }
    ],
    "metadata": {
    "kernelspec": {
    "display_name": "Python 3",
    "language": "python",
    "name": "python3"
    },
    "language_info": {
    "codemirror_mode": {
    "name": "ipython",
    "version": 3
    },
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
    "version": "3.6.2"
    }
    },
    "nbformat": 4,
    "nbformat_minor": 2
    }