Python and regex: extracting text from many PDF files (code examples)

Example 1: Extract PDF text with Python

# pip install tika
from tika import parser

# Hand the PDF to Tika's parser, then print the value stored under the
# 'content' key of the result it returns.
raw = parser.from_file('yourfile.pdf')
extracted_text = raw['content']
print(extracted_text)

Example 2: How to loop through the pages of a PDF using Python

import PyPDF2
import re

# Search every page of file1.pdf ... file99.pdf for a fixed key phrase and
# print the match result (an re.Match object, or None) for each page.
#
# NOTE(review): PdfFileReader / getNumPages / getPage / extractText are the
# legacy PyPDF2 method names; newer releases appear to rename them
# (PdfReader, len(reader.pages), reader.pages[i], extract_text) -- confirm
# against the installed PyPDF2 version before upgrading.

# The key phrase is the same for every file and page, so define it once.
KEYTERM = "New York State Real Property Law"

# Escape the phrase so it is always matched literally, even if it is later
# changed to contain regex metacharacters, and compile it a single time.
keyterm_pattern = re.compile(re.escape(KEYTERM))

for k in range(1, 100):
    # open the pdf file ("reader" rather than "object", which would shadow
    # the builtin of the same name)
    reader = PyPDF2.PdfFileReader("C:/my_path/file%s.pdf" % k)

    # get number of pages
    num_pages = reader.getNumPages()

    # extract text from each page and do the search
    for i in range(num_pages):
        page = reader.getPage(i)
        print("this is page " + str(i))
        page_text = page.extractText()
        print(keyterm_pattern.search(page_text))