Python BeautifulSoup web scraping code examples

Example 1: python web scraping

import requests
from bs4 import BeautifulSoup

# Search-results page to download (query "Software-Developer", location "Australia").
URL = 'https://www.monster.com/jobs/search/?q=Software-Developer&where=Australia'
# Always pass a timeout: without one, requests can wait indefinitely on a
# server that accepts the connection but never answers.
page = requests.get(URL, timeout=10)

# Parse the raw response bytes with the 'html.parser' backend.
soup = BeautifulSoup(page.content, 'html.parser')

Example 2: web scraper python (imports used by the functions in Example 4)

from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup

Example 3: Use BeautifulSoup or Scrapy to Scrape a Book Store

import scrapy

class bookScraper(scrapy.Spider):
    """Spider that starts at books.toscrape.com and yields one item per book card.

    Every item is a dict with the keys 'image_url', 'book_title' and
    'product_price', each holding the result of `.extract()` on the matching
    selector inside the card.
    """

    name = "bookscrape"

    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        """Yield an item for each book card on the page, then follow the "next" link.

        Pagination stops when the page has no `li.next` link.
        """
        for card in response.css('.col-lg-3 '):
            yield {
                'image_url': card.css('a img::attr(src)').extract(),
                'book_title': card.css('h3 a::attr(title)').extract(),
                'product_price': card.css(
                    'div.product_price p.price_color::text'
                ).extract(),
            }

        # No "next" link means this was the last page: nothing more to schedule.
        following = response.css('li.next a::attr(href)').get()
        if following is None:
            return
        yield response.follow(following, callback=self.parse)

Example 4: web scraper python (requires the imports from Example 2)

def simple_get(url, timeout=10):
    """
    Attempt to fetch the content at `url` with an HTTP GET request.

    Args:
        url: Address to request.
        timeout: Seconds to wait on the server before giving up; forwarded
            to `get`. Without it a stalled server would block forever.

    Returns:
        The raw response body (`resp.content`, bytes) when
        `is_good_response` accepts the response, otherwise None. None is
        also returned when the request raises `RequestException` (this
        includes timeouts); the error is reported through `log_error`.
    """
    try:
        # closing() releases the streamed connection when the block exits.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            return resp.content if is_good_response(resp) else None
    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Return True if the response seems to be HTML, False otherwise.

    A response qualifies when its status code is 200 and its Content-Type
    header contains "html" (compared case-insensitively). A response with
    no Content-Type header is treated as not HTML rather than raising
    KeyError.
    """
    # .get() instead of [...]: the header may be absent. `or ''` also covers
    # a header that is present but set to None.
    content_type = (resp.headers.get('Content-Type') or '').lower()
    return resp.status_code == 200 and 'html' in content_type


def log_error(e):
    """
    Report an error by printing it to standard output.

    This is the single hook for error reporting; swap the body for a
    real logger if printing is not enough.
    """
    print(e)

Tags:

Misc Example