Scraping wunderground without API, using python
They have added some additional tables at the top, just searching with table will not work now, I have used the class selector with the class name to fetch the record, it's working fine
tables = WebDriverWait(driver,20).until(EC.presence_of_all_elements_located((By.CLASS_NAME, "mat-table.cdk-table.mat-sort.ng-star-inserted")))
Another direction: Use the API calls that the website is doing.
(The HTTP call was taken from Chrome developer tools)
Example:
HTTP GET https://api-ak.wunderground.com/api/d8585d80376a429e/history_20180812/lang:EN/units:english/bestfct:1/v:2.0/q/HSSS.json?showObs=0&ttl=120
Response
{
"response": {
"version": "2.0",
"units": "english",
"termsofService": "https://www.wunderground.com/weather/api/d/terms.html",
"attribution": {
"image":"//icons.wxug.com/graphics/wu2/logo_130x80.png",
"title":"Weather Underground",
"link":"http://www.wunderground.com"
},
"features": {
"history": 1
}
, "location": {
"name": "Khartoum",
"neighborhood":null,
"city": "Khartoum",
"state": null,
"state_name":"Sudan",
"country": "SD",
"country_iso3166":"SA",
"country_name":"Saudi Arabia",
"continent":"AS",
"zip":"00000",
"magic":"474",
"wmo":"62721",
"radarcode":"xxx",
"radarregion_ic":null,
"radarregion_link": "//",
"latitude":15.60000038,
"longitude":32.54999924,
"elevation":null,
"wfo": null,
"l": "/q/zmw:00000.474.62721",
"canonical": "/weather/sa/khartoum"
},
"date": {
"epoch": 1553287561,
"pretty": "11:46 PM EAT on March 22, 2019",
"rfc822": "Fri, 22 Mar 2019 23:46:01 +0300",
"iso8601": "2019-03-22T23:46:01+0300",
"year": 2019,
"month": 3,
"day": 22,
"yday": 80,
"hour": 23,
"min": "46",
"sec": 1,
"monthname": "March",
"monthname_short": "Mar",
"weekday": "Friday",
"weekday_short": "Fri",
"ampm": "PM",
"tz_short": "EAT",
"tz_long": "Africa/Khartoum",
"tz_offset_text": "+0300",
"tz_offset_hours": 3.00
}
}
,
"history": {
"start_date": {
"epoch": 1534064400,
"pretty": "12:00 PM EAT on August 12, 2018",
"rfc822": "Sun, 12 Aug 2018 12:00:00 +0300",
"iso8601": "2018-08-12T12:00:00+0300",
"year": 2018,
"month": 8,
"day": 12,
"yday": 223,
"hour": 12,
"min": "00",
"sec": 0,
"monthname": "August",
"monthname_short": "Aug",
"weekday": "Sunday",
"weekday_short": "Sun",
"ampm": "PM",
"tz_short": "EAT",
"tz_long": "Africa/Khartoum",
"tz_offset_text": "+0300",
"tz_offset_hours": 3.00
},
"end_date": {
"epoch": null,
"pretty": null,
"rfc822": null,
"iso8601": null,
"year": null,
"month": null,
"day": null,
"yday": null,
"hour": null,
"min": null,
"sec": null,
"monthname": null,
"monthname_short": null,
"weekday": null,
"weekday_short": null,
"ampm": null,
"tz_short": null,
"tz_long": null,
"tz_offset_text": null,
"tz_offset_hours": null
},
"days": [
{
"summary": {
"date": {
"epoch": 1534021200,
"pretty": "12:00 AM EAT on August 12, 2018",
"rfc822": "Sun, 12 Aug 2018 00:00:00 +0300",
"iso8601": "2018-08-12T00:00:00+0300",
"year": 2018,
"month": 8,
"day": 12,
"yday": 223,
"hour": 0,
"min": "00",
"sec": 0,
"monthname": "August",
"monthname_short": "Aug",
"weekday": "Sunday",
"weekday_short": "Sun",
"ampm": "AM",
"tz_short": "EAT",
"tz_long": "Africa/Khartoum",
"tz_offset_text": "+0300",
"tz_offset_hours": 3.00
},
"temperature": 82,
"dewpoint": 66,
"pressure": 29.94,
"wind_speed": 11,
"wind_dir": "SSE",
"wind_dir_degrees": 166,
"visibility": 5.9,
"humidity": 57,
"max_temperature": 89,
"min_temperature": 75,
"temperature_normal": null,
"min_temperature_normal": null,
"max_temperature_normal": null,
"min_temperature_record": null,
"max_temperature_record": null,
"min_temperature_record_year": null,
"max_temperature_record_year": null,
"max_humidity": 83,
"min_humidity": 40,
"max_dewpoint": 70,
"min_dewpoint": 63,
"max_pressure": 29.98,
"min_pressure": 29.89,
"max_wind_speed": 22,
"min_wind_speed": 5,
"max_visibility": 6.2,
"min_visibility": 1.9,
"fog": 0,
"hail": 0,
"snow": 0,
"rain": 1,
"thunder": 0,
"tornado": 0,
"snowfall": null,
"monthtodatesnowfall": null,
"since1julsnowfall": null,
"snowdepth": null,
"precip": 0.00,
"preciprecord": null,
"preciprecordyear": null,
"precipnormal": null,
"since1janprecipitation": null,
"since1janprecipitationnormal": null,
"monthtodateprecipitation": null,
"monthtodateprecipitationnormal": null,
"precipsource": "3Or6HourObs",
"gdegreedays": 32,
"heatingdegreedays": 0,
"coolingdegreedays": 17,
"heatingdegreedaysnormal": null,
"monthtodateheatingdegreedays": null,
"monthtodateheatingdegreedaysnormal": null,
"since1sepheatingdegreedays": null,
"since1sepheatingdegreedaysnormal": null,
"since1julheatingdegreedays": null,
"since1julheatingdegreedaysnormal": null,
"coolingdegreedaysnormal": null,
"monthtodatecoolingdegreedays": null,
"monthtodatecoolingdegreedaysnormal": null,
"since1sepcoolingdegreedays": null,
"since1sepcoolingdegreedaysnormal": null,
"since1jancoolingdegreedays": null,
"since1jancoolingdegreedaysnormal": null
,
"avgoktas": 5,
"icon": "rain"
}
}
]
}
}
I do it in the following manner.
I open developer tools using Ctrl+Shift+I
Then I submit a request via the website while recording the transactions (In this case, you just click on the View
button. Then I filter those for XHR.
In the requests that remain, I go over the responses for each request that remain. The response that looked like the one I wanted I take its request URL and use it. It might be best to copy the response to a separate JSON file
and beautify it so it's easy to read and determine if that's what you want.
In my scenario, my request URL was a get request to the following https://api.weather.com/v1/location/OLBA:9:LB/observations/historical.json?apiKey=_____________&units=e&startDate=20200305
I removed the API KEY from the URL above in order for me to use it
When you paste the URL into a browser you should get the same response and then you can use Python requests package to get the response and just parse the JSON.
you could use selenium to ensure page load then pandas read_html to get tables
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
url = 'https://www.wunderground.com/history/daily/sd/khartoum/HSSS/date/2019-03-12'
driver = webdriver.Chrome()
driver.get(url)
tables = WebDriverWait(driver,20).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "table")))
for table in tables:
newTable = pd.read_html(table.get_attribute('outerHTML'))
if newTable:
print(newTable[0].fillna(''))