Parse XML Sitemap with Python
I chose to use the Requests and BeautifulSoup libraries. I created a dictionary where each key is a URL and each value is its last-modified date.
from bs4 import BeautifulSoup
import requests
# Build a {sitemap URL: last-modified date} mapping from a sitemap index.
xmlDict = {}
r = requests.get("http://www.site.co.uk/sitemap.xml")
xml = r.text
# Name the parser explicitly: without it bs4 picks whichever parser happens
# to be installed (and emits a warning), so results can vary by machine.
soup = BeautifulSoup(xml, "html.parser")
sitemapTags = soup.find_all("sitemap")
# print() with a single argument runs on both Python 2 and Python 3.
print("The number of sitemaps are {0}".format(len(sitemapTags)))
for sitemap in sitemapTags:
    # find() searches only inside this <sitemap>; findNext() walks forward
    # through the rest of the document and could return the next entry's
    # <lastmod> when this entry has none.
    loc = sitemap.find("loc")
    if loc is None:
        continue
    lastmod = sitemap.find("lastmod")
    # Strip the whitespace/newlines that often surround the element text.
    xmlDict[loc.text.strip()] = (
        lastmod.text.strip() if lastmod is not None else None
    )
print(xmlDict)
Or with lxml:
from lxml import etree
import requests
# Build a {sitemap URL: last-modified date} mapping using lxml.
xmlDict = {}
r = requests.get("http://www.site.co.uk/sitemap.xml")
# Parse the raw bytes so the encoding declared in the XML is respected.
root = etree.fromstring(r.content)
# print() with a single argument runs on both Python 2 and Python 3.
print("The number of sitemap tags are {0}".format(len(root)))
for sitemap in root:
    # list(element) replaces the deprecated getchildren().
    children = list(sitemap)
    if not children:
        # Comments or empty entries have no child elements to read.
        continue
    # Relies on <loc> being the first child and <lastmod> the second;
    # <lastmod> is optional, so fall back to None when it is absent.
    lastmod = children[1].text if len(children) > 1 else None
    xmlDict[children[0].text] = lastmod
print(xmlDict)
This function extracts all URLs from an XML sitemap:
from bs4 import BeautifulSoup
import requests
def get_urls_of_xml(xml_url):
    """Return the text of every <loc> element in the sitemap at *xml_url*.

    Args:
        xml_url: URL of the XML sitemap (or sitemap index) to download.

    Returns:
        A list of strings, one per <loc> element in document order, with
        surrounding whitespace stripped.
    """
    r = requests.get(xml_url)
    # Name the parser explicitly: without it bs4 picks whichever parser
    # happens to be installed (and emits a warning).
    soup = BeautifulSoup(r.text, "html.parser")
    return [link.get_text("", strip=True) for link in soup.find_all("loc")]
# Example: list every URL published in the GOV.UK sitemap.
sitemap_url = "https://www.gov.uk/sitemap.xml"
links_data_arr = get_urls_of_xml(sitemap_url)
print(links_data_arr)
Here is how to use BeautifulSoup to get the sitemap count and extract the text:
from bs4 import BeautifulSoup as bs
# Sample sitemap-index fragment used to demonstrate counting and text
# extraction without any network access.
html = """
<sitemap>
<loc>
http://www.site.co.uk/drag_it/dragitsitemap_static_0.xml
</loc>
<lastmod>2015-07-07</lastmod>
</sitemap>
<sitemap>
<loc>
http://www.site.co.uk/drag_it/dragitsitemap_alpha_0.xml
</loc>
<lastmod>2015-07-07</lastmod>
</sitemap>
"""
soup = bs(html, "html.parser")
sitemap_count = len(soup.find_all('sitemap'))
# Format the computed count; the original referenced an undefined name
# ("sitemap"), which raised NameError.
print("sitemap count: %d" % sitemap_count)
# get_text() concatenates the text of every element in the document.
print(soup.get_text())
Output:
sitemap count: 2
http://www.site.co.uk/drag_it/dragitsitemap_static_0.xml
2015-07-07
http://www.site.co.uk/drag_it/dragitsitemap_alpha_0.xml
2015-07-07
Using Python 3, requests, xmltodict, pandas and a list comprehension:
import requests
import pandas as pd
import xmltodict
# Download a sitemap index and load its entries into a DataFrame with one
# row per <sitemap> (columns: links, lastmod).
url = "https://www.gov.uk/sitemap.xml"
res = requests.get(url)
raw = xmltodict.parse(res.text)
entries = raw["sitemapindex"]["sitemap"]
# xmltodict returns a single dict instead of a list when the index contains
# exactly one <sitemap>; normalise so the comprehension sees a list.
if isinstance(entries, dict):
    entries = [entries]
# <lastmod> is optional in a sitemap index, so use .get() rather than
# indexing to avoid a KeyError on entries that omit it.
data = [[entry["loc"], entry.get("lastmod")] for entry in entries]
print("Number of sitemaps:", len(data))
df = pd.DataFrame(data, columns=["links", "lastmod"])
Output:
links lastmod
0 https://www.gov.uk/sitemaps/sitemap_1.xml 2018-11-06T01:10:02+00:00
1 https://www.gov.uk/sitemaps/sitemap_2.xml 2018-11-06T01:10:02+00:00
2 https://www.gov.uk/sitemaps/sitemap_3.xml 2018-11-06T01:10:02+00:00
3 https://www.gov.uk/sitemaps/sitemap_4.xml 2018-11-06T01:10:02+00:00
4 https://www.gov.uk/sitemaps/sitemap_5.xml 2018-11-06T01:10:02+00:00