Scraping Reddit Programmer Humor with Node.js using Puppeteer code example

Example: Scraping Reddit Programmer Humor with Node.js using Puppeteer

/*
    This code comes from Vincent Lab
    And it has a video version linked here: https://www.youtube.com/watch?v=Zb--639XePw
*/

// Import dependencies
const puppeteer = require("puppeteer");
const fs = require("fs");

/**
 * Script entry point: opens r/ProgrammerHumor in a Puppeteer-controlled browser,
 * scrolls through the feed collecting post elements, extracts each post's title
 * and image URL, and writes the result to "jokes.json" as a JSON array of
 * `{ title, image }` objects.
 *
 * Posts that lack a title or an image element are skipped. The browser is
 * always closed, even when a step fails, and a failure sets a non-zero exit code.
 */
(async () => {

    // Number of collect-and-scroll passes over the feed
    // (the original author notes that 25 passes yield roughly 100 posts)
    const amount = 25;

    // The location / URL
    const url = "https://www.reddit.com/r/ProgrammerHumor/";

    // Jokes keyed by title, so a post collected on several passes is stored once.
    // A Map (rather than a plain object) treats titles such as "__proto__" like any
    // other key and keeps entries in first-seen order.
    const jokesByTitle = new Map();

    console.log("Getting posts from Reddit");

    // Create the browser
    const browser = await puppeteer.launch({
        headless: false
    });

    try {

        // Navigate to the website
        const page = await browser.newPage();
        await page.goto(url, { waitUntil: "load" });

        // Get the root element of all the posts
        const [root] = await page.$$(".rpBJOHq2PR60pnwJlUyP0");
        if (!root) {
            // Fail with a clear message instead of a TypeError on `root.$$` below
            throw new Error("Could not find the root element of the posts; the page markup may have changed");
        }

        // All the post element handles gathered so far (may contain repeats across passes)
        const posts = [];

        for (let pass = 0; pass < amount; pass++) {

            // Get all the posts currently matched under the root
            const chunk = await root.$$("._1poyrkZ7g36PawDueRza-J");
            posts.push(...chunk);

            // Wait for 1 second before scrolling on
            await sleep(1000);

            // Scroll to the next chunk
            // NOTE(review): 632 * 12 looks like "12 posts of ~632px each" - confirm
            await page.evaluate(() => {
                window.scrollBy(0, (632 * 12));
            });

        }

        console.log("Extracting jokes from posts");

        // Extraction is best-effort: getProperty rejects when a post has no matching
        // element (e.g. no image), so such posts are counted and skipped.
        let skipped = 0;
        let lastError = null;

        for (const post of posts) {

            try {

                // Get the title
                const title = await getProperty(post, "textContent", "_eYtD2XCVieq6emjKBH3m");

                // Get the image URL
                const image = await getProperty(post, "src", "ImageBox-image");

                // Add (or overwrite) the joke under its title
                jokesByTitle.set(title, { image });

            } catch (error) {
                skipped++;
                lastError = error;
            }

        }

        if (skipped > 0) {
            // Surface skipped posts instead of swallowing the failures silently
            console.warn(`Skipped ${skipped} of ${posts.length} collected posts (last error: ${lastError.message})`);
        }

        console.log("Converting jokes into an array");

        // Convert the title-keyed map into an array of { title, image }
        const jokes = Array.from(jokesByTitle, ([title, { image }]) => ({ title, image }));

        console.log("Saving jokes");

        // Save the jokes to a file
        fs.writeFileSync("jokes.json", JSON.stringify(jokes));

    } finally {

        // Close the browser whether or not scraping succeeded
        await browser.close();

    }

})().catch((error) => {
    // Report the failure explicitly rather than leaving the promise floating
    console.error(error);
    process.exitCode = 1;
});

/**
 * Read a property from the first descendant of `rootElement` that has the given class.
 *
 * @param {object} rootElement - Handle exposing an async `$$(selector)` method
 *     (this file passes Puppeteer element handles).
 * @param {string} property - Name of the property to read, e.g. "textContent" or "src".
 * @param {string} className - CSS class name to look up, without the leading dot.
 * @returns {Promise<*>} The value produced by `jsonValue()` on the property handle.
 * @throws {Error} When no descendant with that class exists.
 */
async function getProperty(rootElement, property, className) {
    // Only the first match is used
    const [element] = await rootElement.$$(`.${className}`);

    // Fail with a descriptive message instead of "Cannot read properties of undefined"
    if (!element) {
        throw new Error(`No element with class "${className}" found`);
    }

    const handle = await element.getProperty(property);
    return handle.jsonValue();
}

/**
 * Pause for a number of milliseconds.
 *
 * @param {number} ms - Delay handed to `setTimeout`.
 * @returns {Promise<undefined>} Promise that resolves (with no value) once the timer fires.
 */
function sleep(ms) {
    return new Promise((wake) => {
        setTimeout(wake, ms);
    });
}

Tags:

Misc Example