Speeding Up Excel Data to Pandas
Here is a quick benchmark (extending this one). Apparently, using xlrd directly is slightly faster than pandas for the test .xlsx file. If .csv files are available, reading them is definitely much faster but converting them using LibreOffice is substantially slower:
pd_base 1.96 [in seconds]
pd_float 2.03
pd_object 2.01 [see cs95´s comment to your question]
pd_xlrd 1.95
pyxl_base 2.15
xlrd_base 1.79
csv_ready 0.17
csv_convert 18.72
Here is the code:
import pandas as pd
import openpyxl
import xlrd
import subprocess
file = 'test.xlsx'
df = pd.DataFrame([[i+j for i in range(50)] for j in range(100)])
df.to_excel(file, index=False)
df.to_csv(file.replace('.xlsx', '.csv'), index=False)
def pd_base():
df = pd.read_excel(file)
def pd_float():
df = pd.read_excel(file, dtype=np.int)
def pd_object():
df = pd.read_excel(file, sheet_name="Sheet1", dtype=object)
def pd_xlrd():
df = pd.read_excel(file, engine='xlrd')
def pyxl_base():
wb = openpyxl.load_workbook(file, read_only=True, keep_links=False, data_only=True)
sh = wb.active
df = pd.DataFrame(sh.values)
def xlrd_base():
wb = xlrd.open_workbook(file)
sh = wb.sheet_by_index(0)
df = pd.DataFrame([sh.row_values(n) for n in range(sh.nrows)])
def csv_ready():
df = pd.read_csv(file.replace('.xlsx', '.csv'))
def csv_convert():
out = subprocess.check_output(['libreoffice --headless --convert-to csv test.xlsx'], shell=True, stderr=subprocess.STDOUT)
df = pd.read_csv(file.replace('.xlsx', '.csv'))
def measure(func, nums=50):
temp = time.time()
for num in range(nums):
func()
diff = time.time() - temp
print(func.__name__, '%.2f' % diff)
for func in [pd_base, pd_float, pd_object, pd_xlrd, pyxl_base, xlrd_base, csv_ready, csv_convert]:
measure(func)