How to extract the table data of a designated div with lxml?
Here is the HTML you get using your code; the table is inside a script tag:
<script type="text/template" id="tmpl_zyzb">
{{if (zyzb==null||zyzb.length<=0)}}
<div>
暂无数据
</div>
{{else}}
<table>
<tbody>
<tr>
<th class="tips-colname-Left">
<span>每股指标</span>
</th>
{{each zyzb as value i}}
<th class="tips-fieldname-Right" data-value="{{value.date}}">
<span>{{value.date.substr(2,8)}}</span>
</th>
{{/each}}
</tr>
To use Selenium, you need to wait for the page to load. The code below shows an example using WebDriverWait:
from selenium.webdriver.support.ui import WebDriverWait
# Load the finance-analysis page for stock sz300059.
browser.get("http://f10.eastmoney.com/f10_v2/FinanceAnalysis.aspx?code=sz300059")

# Poll for up to 10 seconds until document.readyState is either
# "complete" or "interactive".
ready_state_script = 'return ["complete", "interactive"].indexOf(document.readyState) != -1'
page_wait = WebDriverWait(browser, 10)
page_wait.until(lambda driver: driver.execute_script(ready_state_script))

# Parse the page source with lxml and print the text of the first match
# of each XPath query.
root = lxml.html.document_fromstring(browser.page_source)
name_nodes = root.xpath("//*[@class='name']//strong")
print(name_nodes[0].text)
header_spans = root.xpath("//div[@id='report_zyzb']//th//span")
print(header_spans[0].text)
You can get the data for all tables on the page using API requests. Each tab of the table in the first section, 主要指标 (main indicators), has its own URL that differs only in the type parameter (0, 1 or 2). The same approach also works for the other tables:
import requests
# Request headers sent with every call.
headers = {
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
    'Accept': '*/*',
    'DNT': '1',
    'X-Requested-With': 'XMLHttpRequest',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
    'Referer': 'http://f10.eastmoney.com/f10_v2/FinanceAnalysis.aspx?code=sz300059',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'ru,en-US;q=0.9,en;q=0.8,tr;q=0.7',
}

# Section: main indicators.
section_url = 'http://f10.eastmoney.com/NewFinanceAnalysis/MainTargetAjax'
data_code = 'SZ300059'

# Values of the `type` query parameter, one per tab of the table:
# 0 = by reporting period, 1 = by year, 2 = by single quarter.
data_types = (0, 1, 2)

with requests.Session() as s:
    # Set the headers once on the session instead of on every call.
    s.headers.update(headers)
    for data_type in data_types:
        # `params` builds the same "?type=...&code=..." query string and
        # takes care of URL-encoding. The original verify=False is dropped:
        # the URL is plain http, and certificate checks should stay enabled
        # if the request is ever redirected to https.
        response = s.get(
            section_url,
            params={'type': data_type, 'code': data_code},
            timeout=10,  # do not hang forever on an unresponsive server
        )
        # Fail loudly on 4xx/5xx instead of printing an error page as data.
        response.raise_for_status()
        print(response.text)
JSON response (formatted):
[
{
"date":"2018-12-31",
"jbmgsy":"0.1858",
"kfmgsy":"0.1836",
"xsmgsy":"0.1858",
"mgjzc":"2.8010",
"mggjj":"1.0650",
"mgwfply":"0.6603",
"mgjyxjl":"0.5161",
"yyzsr":"31.2亿",
"mlr":"8.51亿",
"gsjlr":"9.59亿",
"kfjlr":"9.47亿",
"yyzsrtbzz":"22.64",
"gsjlrtbzz":"50.52",
"kfjlrtbzz":"53.87",
"yyzsrgdhbzz":"-1.17",
"gsjlrgdhbzz":"-2.19",
"kfjlrgdhbzz":"-0.92",
"jqjzcsyl":"6.32",
"tbjzcsyl":"6.11",
"tbzzcsyl":"2.35",
"mll":"27.25",
"jll":"30.68",
"sjsl":"14.51",
"yskyysr":"0.10",
"xsxjlyysr":"1.12",
"jyxjlyysr":"2.04",
"zzczzy":"0.08",
"yszkzzts":"80.72",
"chzzts":"139.58",
"zcfzl":"60.58",
"ldzczfz":"79.99",
"ldbl":"1.78",
"sdbl":"1.77"
},
{
"date":"2017-12-31",
"jbmgsy":"0.1488",
"kfmgsy":"0.1438",
"xsmgsy":"0.1488",
"mgjzc":"3.1381",
"mggjj":"1.4559",
"mgwfply":"0.6116",
"mgjyxjl":"-1.4363",
"yyzsr":"25.5亿",
"mlr":"4.75亿",
"gsjlr":"6.37亿",
"kfjlr":"6.16亿",
"yyzsrtbzz":"8.29",
"gsjlrtbzz":"-10.77",
"kfjlrtbzz":"3.43",
"yyzsrgdhbzz":"7.48",
"gsjlrgdhbzz":"6.80",
"kfjlrgdhbzz":"9.79",
"jqjzcsyl":"4.86",
"tbjzcsyl":"4.34",
"tbzzcsyl":"1.84",
"mll":"18.64",
"jll":"24.93",
"sjsl":"6.51",
"yskyysr":"0.12",
"xsxjlyysr":"1.05",
"jyxjlyysr":"-5.54",
"zzczzy":"0.07",
"yszkzzts":"98.08",
"chzzts":"125.67",
"zcfzl":"64.92",
"ldzczfz":"80.05",
"ldbl":"1.67",
"sdbl":"1.66"
},
{
"date":"2016-12-31",
"jbmgsy":"0.2059",
"kfmgsy":"0.1717",
"xsmgsy":"0.2059",
"mgjzc":"3.6042",
"mggjj":"1.9186",
"mgwfply":"0.6112",
"mgjyxjl":"-1.1882",
"yyzsr":"23.5亿",
"mlr":"6.47亿",
"gsjlr":"7.14亿",
"kfjlr":"5.95亿",
"yyzsrtbzz":"-19.62",
"gsjlrtbzz":"-61.39",
"kfjlrtbzz":"-66.86",
"yyzsrgdhbzz":"-1.13",
"gsjlrgdhbzz":"-24.72",
"kfjlrgdhbzz":"-26.92",
"jqjzcsyl":"6.60",
"tbjzcsyl":"5.57",
"tbzzcsyl":"2.81",
"mll":"27.49",
"jll":"30.29",
"sjsl":"10.74",
"yskyysr":"0.11",
"xsxjlyysr":"1.04",
"jyxjlyysr":"-3.51",
"zzczzy":"0.09",
"yszkzzts":"90.54",
"chzzts":"75.18",
"zcfzl":"52.45",
"ldzczfz":"97.77",
"ldbl":"1.56",
"sdbl":"1.55"
},
{
"date":"2015-12-31",
"jbmgsy":"1.0897",
"kfmgsy":"1.0585",
"xsmgsy":"1.0897",
"mgjzc":"4.4066",
"mggjj":"2.3754",
"mgwfply":"0.9065",
"mgjyxjl":"0.2953",
"yyzsr":"29.3亿",
"mlr":"20.5亿",
"gsjlr":"18.5亿",
"kfjlr":"18.0亿",
"yyzsrtbzz":"378.08",
"gsjlrtbzz":"1015.45",
"kfjlrtbzz":"1002.51",
"yyzsrgdhbzz":"13.62",
"gsjlrgdhbzz":"17.11",
"kfjlrgdhbzz":"14.51",
"jqjzcsyl":"66.42",
"tbjzcsyl":"22.63",
"tbzzcsyl":"12.36",
"mll":"70.05",
"jll":"63.18",
"sjsl":"14.85",
"yskyysr":"0.07",
"xsxjlyysr":"0.98",
"jyxjlyysr":"0.19",
"zzczzy":"0.20",
"yszkzzts":"27.67",
"chzzts":"--",
"zcfzl":"65.55",
"ldzczfz":"96.64",
"ldbl":"1.31",
"sdbl":"1.31"
},
{
"date":"2014-12-31",
"jbmgsy":"0.1370",
"kfmgsy":"0.1346",
"xsmgsy":"0.1370",
"mgjzc":"1.5540",
"mggjj":"0.2420",
"mgwfply":"0.2640",
"mgjyxjl":"1.9535",
"yyzsr":"6.12亿",
"mlr":"1.94亿",
"gsjlr":"1.66亿",
"kfjlr":"1.63亿",
"yyzsrtbzz":"146.31",
"gsjlrtbzz":"3213.59",
"kfjlrtbzz":"--",
"yyzsrgdhbzz":"39.62",
"gsjlrgdhbzz":"82.92",
"kfjlrgdhbzz":"90.55",
"jqjzcsyl":"9.38",
"tbjzcsyl":"8.82",
"tbzzcsyl":"3.85",
"mll":"31.68",
"jll":"27.07",
"sjsl":"16.01",
"yskyysr":"0.22",
"xsxjlyysr":"1.08",
"jyxjlyysr":"3.86",
"zzczzy":"0.14",
"yszkzzts":"45.05",
"chzzts":"--",
"zcfzl":"69.60",
"ldzczfz":"99.89",
"ldbl":"1.38",
"sdbl":"1.38"
},
{
"date":"2013-12-31",
"jbmgsy":"0.0100",
"kfmgsy":"-0.0039",
"xsmgsy":"0.0100",
"mgjzc":"2.5136",
"mggjj":"1.1785",
"mgwfply":"0.2745",
"mgjyxjl":"0.7084",
"yyzsr":"2.48亿",
"mlr":"-339万",
"gsjlr":"500万",
"kfjlr":"-262万",
"yyzsrtbzz":"11.57",
"gsjlrtbzz":"-86.69",
"kfjlrtbzz":"-108.51",
"yyzsrgdhbzz":"28.64",
"gsjlrgdhbzz":"--",
"kfjlrgdhbzz":"--",
"jqjzcsyl":"0.29",
"tbjzcsyl":"0.30",
"tbzzcsyl":"0.24",
"mll":"-1.36",
"jll":"2.01",
"sjsl":"-0.42",
"yskyysr":"0.39",
"xsxjlyysr":"0.94",
"jyxjlyysr":"1.92",
"zzczzy":"0.12",
"yszkzzts":"62.86",
"chzzts":"--",
"zcfzl":"30.57",
"ldzczfz":"99.25",
"ldbl":"3.02",
"sdbl":"3.02"
},
{
"date":"2012-12-31",
"jbmgsy":"0.1100",
"kfmgsy":"0.0900",
"xsmgsy":"0.1100",
"mgjzc":"5.1175",
"mggjj":"3.3624",
"mgwfply":"0.6399",
"mgjyxjl":"0.0600",
"yyzsr":"2.23亿",
"mlr":"3533万",
"gsjlr":"3758万",
"kfjlr":"3074万",
"yyzsrtbzz":"-20.55",
"gsjlrtbzz":"-64.72",
"kfjlrtbzz":"-68.18",
"yyzsrgdhbzz":"-12.07",
"gsjlrgdhbzz":"-45.99",
"kfjlrgdhbzz":"-50.55",
"jqjzcsyl":"2.20",
"tbjzcsyl":"2.19",
"tbzzcsyl":"2.07",
"mll":"15.86",
"jll":"16.88",
"sjsl":"13.29",
"yskyysr":"0.27",
"xsxjlyysr":"0.77",
"jyxjlyysr":"0.09",
"zzczzy":"0.12",
"yszkzzts":"56.91",
"chzzts":"--",
"zcfzl":"4.54",
"ldzczfz":"97.80",
"ldbl":"20.02",
"sdbl":"20.02"
},
{
"date":"2011-12-31",
"jbmgsy":"0.5100",
"kfmgsy":"0.4600",
"xsmgsy":"0.5100",
"mgjzc":"8.1000",
"mggjj":"5.9674",
"mgwfply":"0.9669",
"mgjyxjl":"0.7431",
"yyzsr":"2.80亿",
"mlr":"1.10亿",
"gsjlr":"1.07亿",
"kfjlr":"9661万",
"yyzsrtbzz":"51.55",
"gsjlrtbzz":"59.62",
"kfjlrtbzz":"35.11",
"yyzsrgdhbzz":"12.27",
"gsjlrgdhbzz":"11.64",
"kfjlrgdhbzz":"4.62",
"jqjzcsyl":"6.44",
"tbjzcsyl":"6.27",
"tbzzcsyl":"6.08",
"mll":"39.14",
"jll":"38.01",
"sjsl":"12.25",
"yskyysr":"0.39",
"xsxjlyysr":"1.12",
"jyxjlyysr":"0.56",
"zzczzy":"0.16",
"yszkzzts":"38.93",
"chzzts":"--",
"zcfzl":"6.76",
"ldzczfz":"100.00",
"ldbl":"13.13",
"sdbl":"13.13"
},
{
"date":"2010-12-31",
"jbmgsy":"0.5100",
"kfmgsy":"0.5400",
"xsmgsy":"0.5100",
"mgjzc":"11.5200",
"mggjj":"9.4387",
"mgwfply":"0.9209",
"mgjyxjl":"0.4991",
"yyzsr":"1.85亿",
"mlr":"7032万",
"gsjlr":"6674万",
"kfjlr":"7150万",
"yyzsrtbzz":"12.01",
"gsjlrtbzz":"-7.13",
"kfjlrtbzz":"6.78",
"yyzsrgdhbzz":"1.73",
"gsjlrgdhbzz":"-10.81",
"kfjlrgdhbzz":"0.68",
"jqjzcsyl":"5.27",
"tbjzcsyl":"4.14",
"tbzzcsyl":"6.67",
"mll":"38.02",
"jll":"36.10",
"sjsl":"9.82",
"yskyysr":"0.37",
"xsxjlyysr":"1.19",
"jyxjlyysr":"0.38",
"zzczzy":"0.18",
"yszkzzts":"50.99",
"chzzts":"--",
"zcfzl":"4.09",
"ldzczfz":"100.00",
"ldbl":"23.80",
"sdbl":"23.80"
}
]
When you crawl a website, it can cause unwanted consequences.
Make sure the site you are crawling does not prohibit you from doing so. If they say don't crawl the website, you should respect that.
I see that your code uses Selenium and outputs an HTML file:
UPDATE: To make the code more stable, I followed Sers's suggestion to improve how the script waits for the page elements to finish loading. I adjusted the code as follows:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
# Headless Chrome configuration.
chrome_options = Options()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument("--headless")
# NOTE(review): executable_path is kept exactly as in the original; newer
# Selenium releases expect the driver path via a Service object - confirm
# against the installed Selenium version.
browser = webdriver.Chrome(options=chrome_options,
                           executable_path=r'F:\chromedriver.exe')
wait = WebDriverWait(browser, 20)
list_stock = ['sz300059', 'sz300766', 'sz002950']

# Locators as (strategy, selector) tuples, shared by the waits below.
per_year_tab_locator = (By.CSS_SELECTOR, "#zyzbTab > li:nth-child(2)")
report_table_locator = (By.CSS_SELECTOR, "#report_zyzb")

try:
    for id_stock in list_stock:
        url_id = "http://f10.eastmoney.com/f10_v2/FinanceAnalysis.aspx?code=" + id_stock
        try:
            browser.get(url_id)

            # Click the "per year" tab once the document has left the
            # "loading" state and the tab can actually be clicked.
            wait.until(lambda d: d.execute_script('return document.readyState') != "loading")
            element_per_year = wait.until(EC.element_to_be_clickable(per_year_tab_locator))
            element_per_year.click()

            # Get the table. wait.until returns the located element, so no
            # second lookup is needed.
            # NOTE(review): #report_zyzb may already be present before the
            # click, so this wait may not guarantee that the per-year data
            # has been rendered yet - confirm against the live page.
            wait.until(lambda d: d.execute_script('return document.readyState') != "loading")
            element_tb_per_year = wait.until(EC.presence_of_element_located(report_table_locator))
            tb_per_year_html = element_tb_per_year.get_attribute('innerHTML')
        except TimeoutException:
            # A timeout on one stock must not abort the remaining ones.
            print(f"Timed out waiting for page to load: {id_stock}")
            continue

        path_file_html = fr'F:\test_{id_stock}.html'
        with open(path_file_html, "w", encoding='utf-8') as fh:
            fh.write(tb_per_year_html)
        print(f'export id: {id_stock}')
finally:
    # quit() closes every window and ends the driver session, so a separate
    # close() beforehand is unnecessary.
    browser.quit()
If WebDriverWait does not work reliably for you, I think you can fall back to time.sleep. You can search online for more information about this.
This is the image: