Skip to content

Instantly share code, notes, and snippets.

@thomas694
Last active April 22, 2023 19:01
Show Gist options
  • Select an option

  • Save thomas694/4be8bbcf63cba6b1b33665b6f3199743 to your computer and use it in GitHub Desktop.

Select an option

Save thomas694/4be8bbcf63cba6b1b33665b6f3199743 to your computer and use it in GitHub Desktop.
Download subreddit info with Google Sheets and Apps Script via pushshift.io
/*
Reddit Scraper
--------------
download subreddit info with Google Sheets and Apps Script via pushshift.io
notes: https://www.labnol.org/internet/web-scraping-reddit/28369/
usage: Extensions > Apps Script, insert script and save, reopen doc
and use new menu
extensions: 2022 by thomas694 (@GH)
===============
Reddit Scraper
===============
author: Amit Agarwal
email: amit@labnol.org
website: www.labnol.org
published: Feb 19, 2020
*/
/* ext1 - add functionality to sheets menu */
/**
 * Simple trigger: runs automatically when the spreadsheet is opened
 * and installs the custom "Go Reddit" menu.
 */
function onOpen() {
  createCustomMenu();
}
/**
 * Adds a "Go Reddit" menu to the spreadsheet UI with one entry that
 * runs FetchReddit for the active sheet.
 */
function createCustomMenu() {
  SpreadsheetApp.getUi()
    .createMenu("Go Reddit")
    .addItem("Fetch subreddit (tab name)", "FetchReddit")
    .addToUi();
}
/** Menu callback: delegates to the main scraper. */
function FetchReddit() {
  scrapeReddit();
}
/* ext1 - add functionality to sheets menu */
/**
 * Wraps a thumbnail URL in a Sheets IMAGE() formula.
 * Reddit uses placeholders like "self"/"default" for posts without a
 * real thumbnail; anything not starting with "http" yields ''.
 * @param {string} url - candidate thumbnail URL
 * @returns {string} IMAGE formula, or '' for non-http values
 */
const getThumbnailLink_ = (url) => (/^http/.test(url) ? `=IMAGE("${url}")` : '');
/**
 * Wraps a URL in a Sheets HYPERLINK() formula with display text.
 * Non-http(s) URLs yield '' (reddit uses placeholder values for
 * link-less posts).
 * Fix: double-quote characters inside url/text are escaped by doubling
 * them ("" is the quote escape inside Sheets formula strings), so a
 * stray quote in a URL can no longer break the generated formula.
 * Output is unchanged for quote-free inputs.
 * @param {string} url - target URL
 * @param {string} text - display text for the link
 * @returns {string} HYPERLINK formula, or '' for non-http URLs
 */
const getHyperlink_ = (url, text) => {
  if (!/^http/.test(url)) return '';
  const esc = (s) => String(s).replace(/"/g, '""');
  return `=HYPERLINK("${esc(url)}", "${esc(text)}")`;
};
/**
 * Appends one row per submission below the active sheet's last row.
 * Columns written: created date, id, author, title, thumbnail formula,
 * link formula, comments formula (id/author are the ext2 additions).
 * @param {Object[]} data - submission records from the pushshift API
 */
const writeDataToSheets_ = (data) => {
  const rows = [];
  for (const post of data) {
    rows.push([
      new Date(post.created_utc * 1000),
      post.id, // ext2: additional columns
      post.author, // ext2: additional columns
      post.title,
      getThumbnailLink_(post.thumbnail),
      getHyperlink_(post.url, 'Link'),
      getHyperlink_(post.full_link, 'Comments'),
    ]);
  }
  const sheet = SpreadsheetApp.getActiveSheet();
  const firstFreeRow = sheet.getLastRow() + 1;
  sheet.getRange(firstFreeRow, 1, rows.length, rows[0].length).setValues(rows);
  // Flush now so already-fetched pages survive a later interruption.
  SpreadsheetApp.flush();
};
/**
 * Queries the pushshift meta endpoint and reports whether fetching
 * should pause because the advertised per-minute rate limit has
 * dropped below 1.
 * Fix: read the body explicitly with getContentText() instead of
 * relying on implicit string coercion of the HTTPResponse object
 * (the coercion happened to work but is not documented behavior).
 * @returns {boolean} true when requests should stop for now
 */
const isRateLimited_ = () => {
  const response = UrlFetchApp.fetch('https://api.pushshift.io/meta');
  const meta = JSON.parse(response.getContentText());
  return meta.server_ratelimit_per_minute < 1;
};
/**
 * Builds the pushshift submission-search URL for a subreddit.
 * Requested fields include id/author (the ext2 extra columns); up to
 * `size` submissions are requested per call.
 * @param {string} subreddit - subreddit name (taken from the sheet tab)
 * @param {string} [before=''] - epoch-seconds cutoff; when set, only
 *   posts older than this are returned (used to page/resume)
 * @returns {string} fully assembled API URL
 */
const getAPIEndpoint_ = (subreddit, before = '') => {
  const params = {
    subreddit,
    size: 1000,
    // ext2: additional columns (id, author)
    fields: ['id', 'author', 'title', 'created_utc', 'url', 'full_link'].join(','),
  };
  if (before) params.before = before;
  const query = Object.entries(params)
    .map(([key, value]) => `${key}=${value}`)
    .join('&');
  return `https://api.pushshift.io/reddit/search/submission?${query}`;
};
/* ext3: dynamically get the subreddit name from the sheet name and delete unnecessary columns */
//const scrapeReddit = (subreddit = 'technology') => {
/**
 * Main scraper (ext3/ext4): downloads submissions for the subreddit
 * named by the active sheet tab and appends them as rows, paging
 * backwards in time until the API returns no more data or reports
 * rate limiting.
 *
 * ext3: the subreddit comes from the sheet name; surplus sheet
 *       columns beyond the 7 written are deleted once (fresh sheets
 *       start with 26 columns).
 * ext4: if the sheet already has rows, resume from the timestamp in
 *       column A of the last row instead of starting from "now".
 *
 * Fixes: response bodies are read via getContentText() rather than
 * implicit HTTPResponse coercion, and a missing `data` field in a
 * malformed response ends the loop cleanly instead of throwing on
 * `data.length`.
 */
const scrapeReddit = () => {
  const sheet = SpreadsheetApp.getActiveSheet();
  const subreddit = sheet.getSheetName();
  // ext3: trim a fresh 26-column sheet down to the 7 columns we use.
  if (sheet.getMaxColumns() === 26) sheet.deleteColumns(8, 19);

  // ext4: continue interrupted work at the oldest written timestamp.
  let before = '';
  const lastRow = sheet.getLastRow();
  if (lastRow > 0) {
    const values = sheet.getSheetValues(lastRow, 1, 1, 1);
    before = (new Date(values[0][0]).valueOf() / 1000).toFixed();
  }

  do {
    const apiUrl = getAPIEndpoint_(subreddit, before);
    const response = UrlFetchApp.fetch(apiUrl);
    const { data } = JSON.parse(response.getContentText());
    if (!data || data.length === 0) break; // exhausted (or malformed response)
    writeDataToSheets_(data);
    // Page: the next request asks for posts older than the oldest seen.
    before = String(data[data.length - 1].created_utc);
  } while (!isRateLimited_());
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment