Web scraping with BeautifulSoup and Scrapy: code examples

Example 1: BeautifulSoup - scraping a link from a web page

import requests
from bs4 import BeautifulSoup

# Fetch the page. The timeout keeps the script from hanging indefinitely on an
# unresponsive server, and raise_for_status() stops on 4xx/5xx responses
# instead of silently parsing an error page.
page = requests.get('http://www.example.com', timeout=10)
page.raise_for_status()
soup = BeautifulSoup(page.content, 'html.parser')

# select_one() returns None when nothing matches, and an <a> tag may lack an
# href attribute, so check both before printing.
link = soup.select_one('p a')
href = link.get('href') if link is not None else None
if href is None:
    print('No link found inside a <p> element')
else:
    print(href)  # get the link of the website

Example 2: Python web scraping - fetching and parsing a page

import requests
from bs4 import BeautifulSoup

# Search-results page to download.
URL = 'https://www.monster.com/jobs/search/?q=Software-Developer&where=Australia'

# Without a timeout requests.get() can block forever; raise_for_status()
# fails fast on an HTTP error instead of parsing an error page.
page = requests.get(URL, timeout=10)
page.raise_for_status()

# Parse the raw response body into a navigable tree.
soup = BeautifulSoup(page.content, 'html.parser')

Example 3: Python web scraping with Scrapy

import scrapy
from ..items import SampletestItem #items class

class QuoteTestSpider(scrapy.Spider):
    """Spider that scrapes quotes starting from https://quotes.toscrape.com/.

    Yields one ``SampletestItem`` per quote with its ``title``, ``author``
    and ``tags`` fields set, then follows the "next" link if there is one.
    """

    name = 'quote_test'
    start_urls = ['https://quotes.toscrape.com/']

    def parse(self, response):
        """Extract every quote on the page, then request the next page.

        Args:
            response: the downloaded page handed over by Scrapy.

        Yields:
            A ``SampletestItem`` for each ``div.quote`` element, followed by
            at most one ``scrapy.Request`` for the next page.
        """
        for quote in response.css("div.quote"):
            # Build a fresh item for every quote. Re-yielding one shared,
            # mutated instance lets later quotes overwrite earlier ones for
            # any consumer that keeps a reference to the yielded object.
            item = SampletestItem()  # items class
            item['title'] = quote.css("span.text::text").get()
            item['author'] = quote.css(".author::text").get()
            item['tags'] = quote.css(".tags .tag::text").getall()
            yield item

        # Pagination belongs outside the loop: the link is a property of the
        # page, so it must be requested once per page, not once per quote.
        next_page = response.css(".next a::attr(href)").get()
        if next_page is not None:
            next_url = response.urljoin(next_page)
            yield scrapy.Request(next_url, callback=self.parse)

Example 4: BeautifulSoup - scraping a list from HTML

from bs4 import BeautifulSoup

# Small, self-contained HTML document used as parsing input: a heading,
# two paragraphs (one with class "subtitle") and an unordered list of
# four names.
SIMPLE_HTML = '''<html>
<head></head>
<body>
<h1>This is a title</h1>
<p class="subtitle">Lorem ipsum dolor sit amet.</p>
<p>Here's another p without a class</p>
<ul>
    <li>Sarah</li>
    <li>Mary</li>
    <li>Charlotte</li>
    <li>Carl</li>
</ul>
</body>
</html>'''

# Parse the document once at module level using the 'html.parser' backend;
# the resulting tree is what find_list() searches.
simple_soup = BeautifulSoup(SIMPLE_HTML, 'html.parser')

# Find list from html
def find_list():
    """Print a list holding the ``.string`` of every <li> tag in ``simple_soup``."""
    names = []
    for li_tag in simple_soup.find_all('li'):
        # .string is the tag's single text child
        names.append(li_tag.string)
    print(names)


find_list()

Example 5: Using Scrapy to scrape a book store

import scrapy

class bookScraper(scrapy.Spider):
    """Spider that walks http://books.toscrape.com/ page by page.

    For every element matching ``.col-lg-3`` it yields a dict with the image
    URL(s), title(s) and price text(s) found inside that element.
    """

    name = "bookscrape"
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        """Yield one dict per book card, then follow the "next" link if any."""
        for card in response.css('.col-lg-3 '):
            # getall() returns every match as a list, the same as extract().
            yield {
                'image_url': card.css('a img::attr(src)').getall(),
                'book_title': card.css('h3 a::attr(title)').getall(),
                'product_price': card.css(
                    'div.product_price p.price_color::text'
                ).getall(),
            }

        next_href = response.css('li.next a::attr(href)').get()
        if next_href is None:
            # No "next" link on this page: nothing more to crawl.
            return
        yield response.follow(next_href, callback=self.parse)

Tags:

Misc Example