Python Beautiful Soup cheat sheet: code example

Example: Beautiful Soup cheat sheet

# Web scraping overview
#   [1] Fetching content -> HTTP requests
#   [2] Parsing content
#       [a] HTML parsing    [Beautiful Soup, RegEx, grep]
#       [b] DOM parsing     [Scrapy, Selenium]
#       [c] Computer vision [Sikuli]
#
# Requires the third-party packages `beautifulsoup4`, `lxml` and `requests`.
# Expected-output comments assume the 'lxml' parser.

import re
import webbrowser
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from bs4 import NavigableString
from bs4 import SoupStrainer

url = "https://www.youtube.com/results?search_query=chess"  # sample URL, not fetched below

# Parsing from a file instead of a string:
# with open("index.html") as fp:
#     soup = BeautifulSoup(fp, 'lxml')

# The stray </a> is deliberate: the parser silently drops unmatched closing tags.
html = """<p1 id="id" class="c1 c2">Par1</p1> <p>Par2</a></p>"""
soup = BeautifulSoup(html, 'lxml')
print(soup)                 # <html><body><p1 class="c1 c2" id="id">Par1</p1> <p>Par2</p></body></html>
print(type(soup))           # <class 'bs4.BeautifulSoup'>
print(soup.prettify())      # indented, one-tag-per-line rendering of the document
print(soup.title)           # None (this document has no <title>)
print(soup.find_all(True))  # True matches every tag in the document

############ Modifying Tags ############
soup.p1.name = "p"          # rename the tag
soup.p['id'] = "my id"      # change the id value
soup.p['att1'] = '2'        # add an attribute
del soup.p['att1']          # remove an attribute
print(soup)                 # <html><body><p class="c1 c2" id="my id">Par1</p> <p>Par2</p></body></html>

############ Inspecting a Tag ############
print(soup.p)                               # <p class="c1 c2" id="my id">Par1</p>
print(type(soup.p))                         # <class 'bs4.element.Tag'>
print(soup.p.name)                          # p
print(soup.p.parent.name)                   # body
print(soup.p['id'])                         # my id
print(soup.p['class'])                      # ['c1', 'c2']
print(soup.p.get_attribute_list('class'))   # ['c1', 'c2']
print(soup.p.attrs)                         # {'id': 'my id', 'class': ['c1', 'c2']}
print(soup.p.string)                        # (tag contents) Par1
print(type(soup.p.string))                  # <class 'bs4.element.NavigableString'>

############ Filtering Elements Using Find and Find All ############
print(soup.find('title'))                   # None when nothing matches
print(soup.find('p'))                       # <p class="c1 c2" id="my id">Par1</p>
print(soup.find_all('p'))                   # [<p class="c1 c2" id="my id">Par1</p>, <p>Par2</p>]
print(soup.find_all(['p', 'id']))           # a list matches tags NAMED 'p' or 'id'
print(soup.find_all(id="my id"))            # [<p class="c1 c2" id="my id">Par1</p>] (search by attribute)
print(soup.find_all(re.compile('my')))      # [] (a bare regex is matched against tag names)
print(soup.find_all(id=re.compile('my')))   # [<p class="c1 c2" id="my id">Par1</p>]
print(soup)
print(soup.find(re.compile("^p")))          # first tag whose name starts with p: <p class="c1 c2" id="my id">Par1</p>
print(soup.find_all(re.compile("^p")))      # all tags whose name starts with p: [<p class="c1 c2" id="my id">Par1</p>, <p>Par2</p>]
print(soup.find_all('p', attrs={'class': re.compile('^c1')}))
print(soup.find_all(re.compile('p')))       # all tags whose name contains p
print("hello")

############ Attributes and Text ############
soup = BeautifulSoup('<a href="www.google.com">Home</a>', 'lxml')
print(soup)                             # <html><body><a href="www.google.com">Home</a></body></html>
print(soup.a['href'])                   # www.google.com
print(soup.get_text())                  # Home
soup.a['rel'] = ['index', 'contents']   # multi-valued attributes are assigned as lists
print(soup)                             # <html><body><a href="www.google.com" rel="index contents">Home</a></body></html>
print(soup.a.get('href'))               # www.google.com (.get() must be called on the tag, not on the soup)

############ Comments ############
soup = BeautifulSoup("<b><!--comments--></b>", 'lxml')
print(soup.b.string)            # comments
print(type(soup.b.string))      # <class 'bs4.element.Comment'>

############ Searching and Filtering Using Custom Functions ############
sisters_html = """
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
"""
soup = BeautifulSoup(sisters_html, 'lxml')


def has_class_but_no_id(tag):
    """Return True for tags that define a ``class`` attribute but no ``id``."""
    return tag.has_attr('class') and not tag.has_attr('id')


print(soup.find_all(has_class_but_no_id))   # only the <p class="story"> tag


def not_lacie(href):
    """Attribute filter: truthy for an href that exists and does not contain "lacie"."""
    return href and not re.compile("lacie").search(href)


print(soup.find_all(href=not_lacie))
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]


###### function example
def surrounded_by_strings(tag):
    """Return True when both the next and the previous parsed element are strings."""
    return (isinstance(tag.next_element, NavigableString)
            and isinstance(tag.previous_element, NavigableString))


for tag in soup.find_all(surrounded_by_strings):
    print(tag.name)

############ Format All Web Links to Full Links ############
url = "https://mashable.com/2014/08/28/static-website-generators/"
response = requests.get(url, timeout=10)    # never block forever on a slow server
response.raise_for_status()                 # fail loudly on 4xx/5xx instead of parsing an error page
soup = BeautifulSoup(response.text, 'lxml')
# print(soup.find_all('a', attrs={'href': re.compile('^https?')}))
for anchor in soup.find_all('a', href=True):    # every <a> that has an href
    # urljoin resolves relative hrefs against the page URL and leaves
    # absolute ones untouched.
    print(urljoin(url, anchor['href']))

############ Parsing Only Part of a Document (SoupStrainer) ############
# Alternative strainers; each assignment replaces the previous one, so only
# the last is passed to the parser below.
my_tags_only = SoupStrainer('a')                        # filter by tag name
my_tags_only = SoupStrainer('div')
my_tags_only = SoupStrainer('img')
my_tags_only = SoupStrainer(rel="icon")                 # filter by attribute
my_tags_only = SoupStrainer(href=re.compile("link"))    # any tag whose href contains "link"
soup = BeautifulSoup(response.text, 'lxml', parse_only=my_tags_only)
print(soup.prettify())

Tags:

Misc Example