tabula python code example

Example 1: python red table from pdf

import tabula

# Read pdf into list of DataFrame
df = tabula.read_pdf("test.pdf", pages='all')

# Read remote pdf into list of DataFrame
df2 = tabula.read_pdf("https://github.com/tabulapdf/tabula-java/raw/master/src/test/resources/technology/tabula/arabic.pdf")

# convert PDF into CSV file
tabula.convert_into("test.pdf", "output.csv", output_format="csv", pages='all')

# convert all PDFs in a directory
tabula.convert_into_by_batch("input_directory", output_format='csv', pages='all')

Example 2: python ocr pdf dataframe

pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe"
import pandas as pd
from PIL import Image
import pytesseract
import io
from wand.image import Image as wand
pd.set_option('max_colwidth', 2000)
pd.options.display.max_rows = 500