Skip to content

Instantly share code, notes, and snippets.

@rahul1906
Last active September 9, 2019 19:14
Show Gist options
  • Select an option

  • Save rahul1906/23faed2a75d4887a9899bd76a5ad7adb to your computer and use it in GitHub Desktop.

Select an option

Save rahul1906/23faed2a75d4887a9899bd76a5ad7adb to your computer and use it in GitHub Desktop.
scraping metadata of machine learning mastery blogs
from bs4 import BeautifulSoup as bs
import requests
from tqdm import tqdm
import pandas as pd
# Scrape the paginated blog listing and collect one metadata record (a dict)
# per <article> element into `total_list`. Pages 1..LAST_PAGE are visited
# from the highest page number down to page 1.
BLOG_PAGE_URL = "https://machinelearningmastery.com/blog/page/{}"
LAST_PAGE = 69
# Seconds to wait on the server; without a timeout a stalled connection
# would block the whole run indefinitely.
REQUEST_TIMEOUT = 30
REQUEST_HEADERS = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}


def _extract_article_fields(article):
    """Return title, url and short description taken from one <article> tag.

    The title and url come from the first <a> inside the article and the
    short description from the first <p>. Any piece that is not present is
    reported as None instead of raising AttributeError.
    """
    link = article.find('a')
    paragraph = article.find('p')
    return {
        'title': link.get('title') if link is not None else None,
        'url': link.get('href') if link is not None else None,
        # get_text() also handles paragraphs with nested tags (where
        # `.string` is None) and returns a plain str that does not keep the
        # parsed page alive; an empty paragraph is still reported as None.
        'short_description': (paragraph.get_text() or None) if paragraph is not None else None,
    }


count = 0
total_list = []
# A single session reuses the underlying connection across all page requests.
with requests.Session() as session:
    session.headers.update(REQUEST_HEADERS)
    for i in tqdm(range(LAST_PAGE, 0, -1)):
        url = BLOG_PAGE_URL.format(i)
        page = session.get(url, timeout=REQUEST_TIMEOUT)
        if not page.ok:
            # Do not parse error pages as if they were listings; report the
            # page through tqdm so the progress bar is not corrupted.
            tqdm.write("skipping {} (HTTP {})".format(url, page.status_code))
            continue
        # Name the parser explicitly so results do not depend on which
        # optional parser libraries happen to be installed.
        soup = bs(page.content, 'html.parser')
        for article in soup.find_all('article'):
            count += 1
            # `index` is a running counter over all pages; `page_no` is the
            # listing page the article was found on.
            attr_dict = {'index': count, 'page_no': i}
            attr_dict.update(_extract_article_fields(article))
            total_list.append(attr_dict)
# Load the list of per-article dicts into a pandas DataFrame. The DataFrame
# constructor accepts a list of records directly; the records should be in
# the format below (a key missing from a record becomes NaN in that row):
#   [{'a': 1, 'b': 2},
#    {'a': 2, 'b': 3},
#    {'b': 4, 'c': 5}]
# refer : https://stackoverflow.com/questions/20638006/convert-list-of-dictionaries-to-dataframe/33020669#33020669
# Passing `columns` fixes the column order and keeps the header row in the
# output files even when no article was collected.
df = pd.DataFrame(
    total_list,
    columns=['index', 'page_no', 'title', 'url', 'short_description'],
)
print(df.shape)
# Print the preview explicitly: a bare `df.head()` expression shows nothing
# when the file is run as a script.
print(df.head())
# Export the same table in both formats, without the DataFrame's own index.
df.to_excel('machine_learning_master_blogs_metadata.xlsx', index=False)
df.to_csv('machine_learning_master_blogs_metadata.csv', index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment