Using BeautifulSoup to web scrape: code examples
Example 1: Web scraping a website with BeautifulSoup
import requests
from bs4 import BeautifulSoup
# Search-results page to fetch (query: Software-Developer, location: Australia).
URL = 'https://www.monster.com/jobs/search/?q=Software-Developer&where=Australia'

# Always pass a timeout: without one, requests may wait indefinitely on a
# server that never answers.
page = requests.get(URL, timeout=10)
# Fail loudly on 4xx/5xx responses instead of silently parsing an error page.
page.raise_for_status()

# Parse the raw response bytes using the 'html.parser' backend.
soup = BeautifulSoup(page.content, 'html.parser')
Example 2: BeautifulSoup - scraping a list from HTML
from bs4 import BeautifulSoup
# Small, self-contained HTML document used as the parsing input below:
# one heading, two paragraphs and an unordered list of four names.
SIMPLE_HTML = '''<html>
<head></head>
<body>
<h1>This is a title</h1>
<p class="subtitle">Lorem ipsum dolor sit amet.</p>
<p>Here's another p without a class</p>
<ul>
<li>Sarah</li>
<li>Mary</li>
<li>Charlotte</li>
<li>Carl</li>
</ul>
</body>
</html>'''
# Parse the sample markup once at module level using the 'html.parser' backend.
simple_soup = BeautifulSoup(SIMPLE_HTML, 'html.parser')
def find_list():
    """Print and return the contents of every ``<li>`` tag in ``simple_soup``.

    Returns:
        list: One entry per ``<li>`` tag found, each being that tag's
        ``.string`` value.
    """
    list_items = simple_soup.find_all('li')
    my_list = [e.string for e in list_items]
    print(my_list)
    # Also return the result so callers can reuse it instead of only
    # seeing it on stdout; existing callers that ignore the return value
    # are unaffected.
    return my_list
# Run the example: prints the list built from the <li> tags.
find_list()
Example 3: Use BeautifulSoup or Scrapy to Scrape a Book Store
import scrapy
class bookScraper(scrapy.Spider):
    """Spider that collects image URL, title and price for each listed book.

    Crawling starts at the page in ``start_urls`` and continues through the
    "next" pagination link for as long as one is present.
    """

    name = "bookscrape"
    start_urls = [
        'http://books.toscrape.com/',
    ]

    def parse(self, response):
        """Yield one item per book on the page, then follow the next page.

        Args:
            response: The downloaded page handed to this callback.

        Yields:
            dict: ``image_url``, ``book_title`` and ``product_price``, each
            holding the list of strings matched by its CSS selector.
            After the items, a follow-up request for the next page when a
            ``li.next`` link exists.
        """
        for book in response.css('.col-lg-3 '):
            # getall() returns every match as a list (same result as the
            # older extract() spelling), keeping the item shape unchanged.
            yield {
                'image_url': book.css('a img::attr(src)').getall(),
                'book_title': book.css('h3 a::attr(title)').getall(),
                'product_price': book.css(
                    'div.product_price p.price_color::text'
                ).getall(),
            }

        # Pagination is resolved once per page, after all of its books have
        # been emitted; response.follow accepts the relative href directly.
        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)