"""Rule-based phishing feature extraction for a single URL.

Label convention used by every feature method:
    1  = legitimate
    -1 = phishing
    0  = suspicious
"""
from datetime import date, datetime
import re
import socket
import ssl
from urllib.parse import urlparse

import numpy as ny
import pandas as pd
import requests
from tld import get_tld
import whois


class DataCleaning:
    """Compute phishing indicators for one URL.

    The URL is parsed once in ``__init__``; each public method then derives
    a single feature and returns 1 (legitimate), -1 (phishing) or
    0 (suspicious).
    """

    # Upper bound, in seconds, for any single network operation so that an
    # unresponsive host cannot block feature extraction indefinitely.
    _NETWORK_TIMEOUT = 10

    # Dotted-decimal IPv4 address, each octet limited to 0-255.
    _OCTET = r"([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])"
    _IPV4_RE = re.compile(r"^(" + _OCTET + r"\.){3}" + _OCTET + r"$")

    # Dotted IPv4 address written as four hexadecimal groups,
    # e.g. 0xC0.0xA8.0x01.0x01 (upper or lower case digits).
    _HEX_IPV4_RE = re.compile(r"0x[0-9a-f]+(\.0x[0-9a-f]+){3}", re.IGNORECASE)

    def __init__(self, url):
        """Store *url*, its parsed form and the current timestamp."""
        self.url = url
        self.path = urlparse(self.url)
        # Reference "now" used by age_of_domain().
        self.date = datetime.now()

    def _host(self):
        """Return the host part of the URL without port or userinfo.

        ``netloc`` may look like ``user@host:8080``; only the bare host is
        meaningful for IP matching, TLS connections and WHOIS lookups.
        Returns an empty string when the URL has no host.
        """
        return self.path.hostname or ''

    def ip(self):
        """Return -1 if the host is a dotted-decimal IPv4 address, else 1."""
        if self._IPV4_RE.match(self._host()):
            return -1
        return 1

    def hex_url(self):
        """Return -1 if the host starts with a hex-encoded IPv4 address, else 1."""
        if self._HEX_IPV4_RE.match(self._host()):
            return -1
        return 1

    def long_url(self):
        """Classify the URL by length.

        Returns 1 for fewer than 54 characters, -1 for more than 75,
        and 0 for anything in between.
        """
        length = len(self.url)
        if length > 75:
            return -1
        if length < 54:
            return 1
        return 0

    def shorten(self):
        """Return -1 if requesting the URL triggers a redirect, else 1.

        A redirect is taken as the sign of a URL-shortening service. The
        redirect history is inspected rather than comparing URL strings,
        because requests normalises the URL it reports (for example by
        appending a trailing slash), which would flag URLs that never
        redirected.

        Raises:
            requests.RequestException: if the request fails or times out.
        """
        response = requests.head(
            self.url, allow_redirects=True, timeout=self._NETWORK_TIMEOUT
        )
        if response.history:
            return -1
        return 1

    def symbol(self):
        """Return -1 if the URL contains an '@' character, else 1."""
        if '@' in self.url:
            return -1
        return 1

    def redirecting(self):
        """Return -1 if '//' occurs more than once in the URL, else 1.

        The first occurrence is expected after the scheme; any further one
        is treated as an embedded redirect.
        """
        if self.url.count('//') > 1:
            return -1
        return 1

    def domain_contain_symbol(self):
        """Return -1 if the host contains a '-', else 1.

        Example of a flagged URL: http://www.Confirme-paypal.com/
        Only the host is inspected; hyphens in the path or query string
        are not an indicator.
        """
        if '-' in self._host():
            return -1
        return 1

    def domain_part(self):
        """Classify the URL by how many dots remain in its domain.

        The top-level domain reported by ``get_tld`` and any 'www.' are
        removed from ``netloc``; the remaining dots are then counted:
        more than 2 -> -1, exactly 2 -> 0, otherwise 1.

        NOTE(review): ``get_tld`` is called without error handling, so
        whatever it raises for a URL it cannot resolve propagates to the
        caller - confirm against the tld package documentation.
        """
        res = get_tld(self.url, as_object=True)
        domain = self.path.netloc
        stripped = domain.replace('.' + res.tld, '')
        stripped = stripped.replace('www.', '')
        dots = stripped.count('.')
        if dots > 2:
            return -1
        if dots > 1:
            return 0
        return 1

    def https_cert(self):
        """Classify the TLS certificate served on port 443 of the host.

        Returns 1 when the certificate's validity period (notAfter minus
        notBefore) exceeds 365 days and its issuer organisation is listed
        in the ``issuer`` column of 'trustedCertAuthority.csv'; 0 when a
        certificate was obtained but does not meet both conditions; -1
        when anything fails (connection, TLS verification, parsing, or
        reading the CSV file).
        """
        try:
            hostname = self._host()
            ctx = ssl.create_default_context()
            # Both sockets are closed by the with-blocks even on failure.
            with socket.create_connection(
                (hostname, 443), timeout=self._NETWORK_TIMEOUT
            ) as raw_sock:
                with ctx.wrap_socket(
                    raw_sock, server_hostname=hostname
                ) as tls_sock:
                    cert = tls_sock.getpeercert()

            not_before = datetime.strptime(
                cert['notBefore'], '%b %d %H:%M:%S %Y %Z'
            )
            not_after = datetime.strptime(
                cert['notAfter'], '%b %d %H:%M:%S %Y %Z'
            )
            validity = not_after - not_before

            # cert['issuer'] is a sequence of one-element tuples of
            # (field, value) pairs; flatten it into a plain dict.
            issuer = dict(x[0] for x in cert['issuer'])
            issued_by = issuer['organizationName']

            df = pd.read_csv('trustedCertAuthority.csv')
            authority = df.issuer.unique()
            if validity.days > 365 and issued_by in authority:
                return 1
            return 0
        except Exception:
            # Best effort: any failure is scored as phishing, but
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            return -1

    def domain_https(self):
        """Return -1 if the literal 'https' appears in netloc, else 1.

        Example of a flagged URL:
        http://https-www-paypal-it-webapps-mpp-home.soft-hair.com/
        """
        if 'https' in self.path.netloc:
            return -1
        return 1

    def age_of_domain(self):
        """Classify the domain by its WHOIS registration age.

        Returns -1 when the domain was created 365 days ago or less,
        1 when it is older, and 0 when the WHOIS lookup or the date
        arithmetic fails.

        NOTE(review): an earlier comment described the cutoff as six
        months, while the implemented cutoff is 365 days - confirm which
        one is intended.
        """
        try:
            domain = whois.query(self._host())
            reg_age = self.date - domain.creation_date
            if reg_age.days <= 365:
                return -1
            return 1
        except Exception:
            return 0

    def dns_record(self):
        """Classify the domain by the name servers in its WHOIS record.

        Returns 1 when at least one name server is listed, -1 when the
        list is empty, and 0 when the lookup fails.
        """
        try:
            domain = whois.query(self._host())
            if len(domain.name_servers) > 0:
                return 1
            return -1
        except Exception:
            return 0