Last active
April 22, 2023 19:01
-
-
Save thomas694/4be8bbcf63cba6b1b33665b6f3199743 to your computer and use it in GitHub Desktop.
Download subreddit info with Google Sheets and Apps Script via pushshift.io
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /* | |
| Reddit Scraper | |
| -------------- | |
| download subreddit info with Google Sheets and Apps Script via pushshift.io | |
| notes: https://www.labnol.org/internet/web-scraping-reddit/28369/ | |
| usage: Extensions > Apps Script, insert script and save, reopen doc | |
| and use new menu | |
| extensions: 2022 by thomas694 (@GH) | |
| =============== | |
| Reddit Scaper | |
| =============== | |
| author: Amit Agarwal | |
| email: amit@labnol.org | |
| website: www.labnol.org | |
| published: Feb 19, 2020 | |
| */ | |
| /* ext1 - add functionality to sheets menu */ | |
| function onOpen() { | |
| createCustomMenu(); | |
| } | |
| function createCustomMenu() { | |
| var menu = SpreadsheetApp.getUi().createMenu("Go Reddit"); | |
| menu.addItem("Fetch subreddit (tab name)", "FetchReddit"); | |
| menu.addToUi(); | |
| } | |
| function FetchReddit() { | |
| scrapeReddit(); | |
| } | |
| /* ext1 - add functionality to sheets menu */ | |
| const getThumbnailLink_ = url => { | |
| if (!/^http/.test(url)) return ''; | |
| return `=IMAGE("${url}")`; | |
| }; | |
| const getHyperlink_ = (url, text) => { | |
| if (!/^http/.test(url)) return ''; | |
| return `=HYPERLINK("${url}", "${text}")`; | |
| }; | |
| const writeDataToSheets_ = data => { | |
| const values = data.map(r => [ | |
| new Date(r.created_utc * 1000), | |
| r.id, // ext2: additional columns | |
| r.author, // ext2: additional columns | |
| r.title, | |
| getThumbnailLink_(r.thumbnail), | |
| getHyperlink_(r.url, 'Link'), | |
| getHyperlink_(r.full_link, 'Comments') | |
| ]); | |
| const sheet = SpreadsheetApp.getActiveSheet(); | |
| sheet.getRange(sheet.getLastRow() + 1, 1, values.length, values[0].length).setValues(values); | |
| SpreadsheetApp.flush(); | |
| }; | |
| const isRateLimited_ = () => { | |
| const response = UrlFetchApp.fetch('https://api.pushshift.io/meta'); | |
| const { server_ratelimit_per_minute: limit } = JSON.parse(response); | |
| return limit < 1; | |
| }; | |
| const getAPIEndpoint_ = (subreddit, before = '') => { | |
| // ext2: additional columns | |
| //const fields = ['title', 'created_utc', 'url', 'thumbnail', 'full_link']; | |
| const fields = ['id', 'author', 'title', 'created_utc', 'url', 'full_link']; | |
| const size = 1000; | |
| const base = 'https://api.pushshift.io/reddit/search/submission'; | |
| const params = { subreddit, size, fields: fields.join(',') }; | |
| if (before) params.before = before; | |
| const query = Object.keys(params) | |
| .map(key => `${key}=${params[key]}`) | |
| .join('&'); | |
| return `${base}?${query}`; | |
| }; | |
| /* ext3: dynamically get the subreddit name from the sheet name and delete unnecessary columns */ | |
| //const scrapeReddit = (subreddit = 'technology') => { | |
| const scrapeReddit = () => { | |
| const sheet = SpreadsheetApp.getActiveSheet(); | |
| let subreddit = sheet.getSheetName(); | |
| if (sheet.getMaxColumns() == 26) sheet.deleteColumns(8, 19); | |
| /* ext3: dynamically get the subreddit name from the sheet name and delete unnecessary columns */ | |
| let before = ''; | |
| /* ext4: continue interrupted work at the oldest date */ | |
| const lastRow = sheet.getLastRow(); | |
| if (lastRow > 0) { | |
| const values = sheet.getSheetValues(lastRow, 1, 1, 1); | |
| before = (new Date(values[0][0]).valueOf()/1000).toFixed(); | |
| } | |
| /* ext4: continue interrupted work at the oldest date */ | |
| do { | |
| const apiUrl = getAPIEndpoint_(subreddit, before); | |
| const response = UrlFetchApp.fetch(apiUrl); | |
| const { data } = JSON.parse(response); | |
| const { length } = data; | |
| before = length > 0 ? String(data[length - 1].created_utc) : ''; | |
| if (length > 0) { | |
| writeDataToSheets_(data); | |
| } | |
| } while (before !== '' && !isRateLimited_()); | |
| }; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment