// Puppeteer and Express.js web scraper code example
// Example: Web scraping with Node.js using Puppeteer
/*
This code comes from Vincent Lab.
A video walkthrough is available here: https://www.youtube.com/watch?v=GAvpZuVzeA8
*/
// AQI | Air Pollution Level | Health Implications | Cautionary Statement (for PM2.5)
// 0-50 | Good | Air quality is considered satisfactory, and air pollution poses little or no risk. | None.
// 51-100 | Moderate | Air quality is acceptable; however, for some pollutants there may be a moderate health concern for a very small number of people who are unusually sensitive to air pollution. | Active children and adults, and people with respiratory disease, such as asthma, should limit prolonged outdoor exertion.
// 101-150 | Unhealthy for Sensitive Groups | Members of sensitive groups may experience health effects. The general public is not likely to be affected. | Active children and adults, and people with respiratory disease, such as asthma, should limit prolonged outdoor exertion.
// 151-200 | Unhealthy | Everyone may begin to experience health effects; members of sensitive groups may experience more serious health effects. | Active children and adults, and people with respiratory disease, such as asthma, should avoid prolonged outdoor exertion; everyone else, especially children, should limit prolonged outdoor exertion.
// 201-300 | Very Unhealthy | Health warnings of emergency conditions. The entire population is more likely to be affected. | Active children and adults, and people with respiratory disease, such as asthma, should avoid all outdoor exertion; everyone else, especially children, should limit outdoor exertion.
// 300+ | Hazardous | Health alert: everyone may experience more serious health effects. | Everyone should avoid all outdoor exertion.
// Import dependencies
const puppeteer = require("puppeteer");
const moment = require("moment");
const fs = require("fs");
(async () => {
// The location / URL
const url = "https://aqicn.org/city/denmark/copenhagen/h.c.andersens-boulevard/";
// Create the browser
const browser = await puppeteer.launch({
headless: true
});
// Navigate to the website
const page = await browser.newPage();
await page.goto(url, { waitUntil: "load" });
// Get the Air quality index
const aqi = await page.$(`#aqiwgtvalue`);
// Extract the index
const index = await (await aqi.getProperty("textContent")).jsonValue();
// Extract the title
const title = await (await aqi.getProperty("title")).jsonValue();
// Extract the location
const location = await (await (await page.$(`#aqiwgttitle1`)).getProperty("textContent")).jsonValue();
// Write the data to a CSV file
if (fs.existsSync("air-pollution.csv")) {
fs.appendFileSync("air-pollution.csv", `\n${moment.utc()}, ${index}, ${title}, ${location.replace(/,/g, '')}`)
} else {
fs.writeFileSync("air-pollution.csv", `datetime, index, title, location\n${moment.utc()}, ${index}, ${title}, ${location.replace(/,/g, '')}`)
}
// Close the browser
await browser.close();