scraping for media beutiful soup code example
Example 1: BeautifulSoup - scraping list from html
from bs4 import BeautifulSoup
# Simple HTML
SIMPLE_HTML = '''<html>
<head></head>
<body>
<h1>This is a title</h1>
<p class="subtitle">Lorem ipsum dolor sit amet.</p>
<p>Here's another p without a class</p>
<ul>
<li>Sarah</li>
<li>Mary</li>
<li>Charlotte</li>
<li>Carl</li>
</ul>
</body>
</html>'''
simple_soup = BeautifulSoup(SIMPLE_HTML, 'html.parser') # use html.parser in order to understand the simple HTML
# Find list from html
def find_list():
list_items = simple_soup.find_all('li')
my_list = [e.string for e in list_items] # convert list_items to string
print(my_list)
find_list()
Example 2: while scraping table data i am getting output as none
In [1]: from bs4 import BeautifulSoup
...: import requests
...: import re
...: url = "http://www.pro-football-reference.com/boxscores/201309050den.htm"
...:
...: html = requests.get(url).content
...: bsObj = BeautifulSoup(re.sub("","", html), "lxml")
...: officials = bsObj.find(id="officials")
...: print(officials)
...:
<table class="suppress_all sortable stats_table" data-cols-to-freeze="0" id="officials"><caption>Officials Table</caption><tr class="thead onecell"><td class=" center" colspan="2" data-stat="onecell">Officials</td></tr>
<tr><th class=" " data-stat="ref_pos" scope="row">Referee</th><td class=" " data-stat="name"><a href="/officials/ColeWa0r.htm">Walt Coleman</a></td></tr>
<tr><th class=" " data-stat="ref_pos" scope="row">Umpire</th><td class=" " data-stat="name"><a href="/officials/ElliRo0r.htm">Roy Ellison</a></td></tr>
<tr><th class=" " data-stat="ref_pos" scope="row">Head Linesman</th><td class=" " data-stat="name"><a href="/officials/BergJe1r.htm">Jerry Bergman</a></td></tr>
<tr><th class=" " data-stat="ref_pos" scope="row">Field Judge</th><td class=" " data-stat="name"><a href="/officials/GautGr0r.htm">Greg Gautreaux</a></td></tr>
<tr><th class=" " data-stat="ref_pos" scope="row">Back Judge</th><td class=" " data-stat="name"><a href="/officials/YettGr0r.htm">Greg Yette</a></td></tr>
<tr><th class=" " data-stat="ref_pos" scope="row">Side Judge</th><td class=" " data-stat="name"><a href="/officials/PattRi0r.htm">Rick Patterson</a></td></tr>
<tr><th class=" " data-stat="ref_pos" scope="row">Line Judge</th><td class=" " data-stat="name"><a href="/officials/BaynRu0r.htm">Rusty Baynes</a></td></tr>
</table>
In [2]: