Beautiful Soup cheat sheet code example
Example: Beautiful Soup cheat sheet
# Web scraping overview
# [1] Fetching content -> HTTP requests
# [2] Parsing content
#     [a] HTML parsing    [Beautiful Soup, RegEx, Grep]
#     [b] DOM parsing     [Scrapy, Selenium]
#     [c] Computer vision [Sikuli]
#
# This is a flat demo script: it parses small inline HTML snippets with
# Beautiful Soup (lxml parser), prints the results of common lookups, then
# fetches one live page to demonstrate link normalisation and SoupStrainer.

import re
import webbrowser  # unused below; kept because the original script imports it
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup, NavigableString, SoupStrainer

url = "https://www.youtube.com/results?search_query=chess"

# Parsing from a file instead of a string:
# with open("index.html") as fp:
#     soup = BeautifulSoup(fp)

html = """<p1 id="id" class="c1 c2">Par1</p1> <p>Par2</a></p>"""
soup = BeautifulSoup(html, 'lxml')
print(soup)  # <html><body><p1 class="c1 c2" id="id">Par1</p1> <p>Par2</p></body></html>
print(type(soup))  # <class 'bs4.BeautifulSoup'>
soup.prettify()  # returns the indented markup as a string (result unused here)
print(soup.title)  # the <title> tag, e.g. <title>abc</title>; None here because html has no <title>
soup.find_all(True)  # find all html elements (result unused here)

############ Modifying Tags and Attributes ############
soup.p1.name = "p"  # renaming tag's name
soup.p['id'] = "my id"  # change id value
soup.p['att1'] = '2'  # adding attributes
del soup.p['att1']  # removing attributes
print(soup)  # <html><body><p class="c1 c2" id="my id">Par1</p> <p>Par2</p></body></html>
print(soup.p)  # <p class="c1 c2" id="my id">Par1</p>
print(type(soup.p))  # <class 'bs4.element.Tag'>
print(soup.p.name)  # p
print(soup.p.parent.name)  # body
print(soup.p['id'])  # my id
print(soup.p['class'])  # ['c1', 'c2']
print(soup.p.get_attribute_list('class'))  # ['c1', 'c2']
print(soup.p.attrs)  # {'id': 'my id', 'class': ['c1', 'c2']}
print(soup.p.string)  # (tag contents) Par1
print(type(soup.p.string))  # <class 'bs4.element.NavigableString'>

############ Filtering Elements Using Find and Find All ############
print(soup.find('title'))  # will print 'None' if nothing is there
print(soup.find('p'))  # <p class="c1 c2" id="my id">Par1</p>
print(soup.find_all('p'))  # [<p class="c1 c2" id="my id">Par1</p>, <p>Par2</p>]
print(soup.find_all(['p', 'id']))  # a list matches any of the given tag NAMES ('p' or 'id')
print(soup.find_all(id="my id"))  # [<p class="c1 c2" id="my id">Par1</p>] (search by attributes)
print(soup.find_all(re.compile('my')))  # a bare regex is matched against tag names
print(soup.find_all(id=re.compile('my')))  # regex matched against the id attribute
print(soup)
print(soup.find(re.compile("^p")))  # will give the 1st tag (name starts with p) <p class="c1 c2" id="my id">Par1</p>
print(soup.find_all(re.compile("^p")))  # will give all tags (name starts with p) [<p class="c1 c2" id="my id">Par1</p>, <p>Par2</p>]
print(soup.find_all('p', attrs={'class': re.compile('^c1')}))
print(soup.find_all(re.compile('p')))  # find all tags whose name contains p
print("hello")

soup = BeautifulSoup('<a href="www.google.com">Home</a>', 'lxml')
print(soup)  # <html><body><a href="www.google.com">Home</a></body></html>
print(soup.a['href'])  # www.google.com
print(soup.get_text())  # Home
soup.a['rel'] = ['index', 'contents']  # <html><body><a href="www.google.com" rel="index contents">Home</a></body></html>
print(soup)
# .get() must be called on the tag; the document object itself has no attributes.
print(soup.a.get('href'))  # www.google.com

# An HTML comment inside a tag is exposed as a Comment object.
# NOTE(review): the scraped original parsed "<b></b>" and read soup.i, which
# yields None; markup/accessor restored so the output matches the documented
# result below - confirm against the original cheat sheet.
soup = BeautifulSoup("<b><!--comment--></b>", 'lxml')
print(soup.b.string)  # comment
print(type(soup.b.string))  # <class 'bs4.element.Comment'>


###### Searching and Filtering Using Custom Functions
def has_class_but_no_id(tag):
    """Return True for tags that define a ``class`` attribute but no ``id``."""
    return tag.has_attr('class') and not tag.has_attr('id')


print(soup.find_all(has_class_but_no_id))


def not_lacie(href):
    """Attribute filter: truthy for a non-empty href that does not contain "lacie"."""
    return href and not re.compile("lacie").search(href)


soup.find_all(href=not_lacie)
# Example output (for a document containing these links):
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


###### function example
def surrounded_by_strings(tag):
    """Return True when both the next and the previous parse elements are strings."""
    return (isinstance(tag.next_element, NavigableString)
            and isinstance(tag.previous_element, NavigableString))


for tag in soup.find_all(surrounded_by_strings):
    print(tag.name)

###### format all web links to full links
url = "https://mashable.com/2014/08/28/static-website-generators/"
# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(url, timeout=10)
soup = BeautifulSoup(response.text, 'lxml')
# print(soup.find_all('a', attrs={'href': re.compile('^http|https')}))
for link in soup.find_all('a', href=True):  # find all the links
    # urljoin resolves relative hrefs against the page URL and returns
    # already-absolute hrefs unchanged.
    print(urljoin(url, link['href']))

###### parse only part of a document with SoupStrainer
# Each assignment below is an alternative filter; only the last one is used.
my_tags_only = SoupStrainer('a')  # find by tag name
my_tags_only = SoupStrainer('div')
my_tags_only = SoupStrainer('img')
my_tags_only = SoupStrainer(rel="icon")  # find by attribute
my_tags_only = SoupStrainer(href=re.compile("link"))  # find all href which has "link" text
soup = BeautifulSoup(response.text, 'lxml', parse_only=my_tags_only)
print(soup.prettify())