Python, regex, many PDF files: extract-text code examples
Example 1: Extract PDF text with Python
from tika import parser
# Hand the PDF to tika's parser and keep the whole parsed result.
raw = parser.from_file('yourfile.pdf')

# The parsed result is indexed by 'content' (presumably the extracted
# text of the document); print that entry.
content = raw['content']
print(content)
Example 2: How to loop through the pages of a PDF using Python
import PyPDF2
import re
# Search every page of file1.pdf ... file99.pdf for a fixed phrase and print
# the outcome of the search for each page.
#
# NOTE(review): PdfFileReader / getNumPages / getPage / extractText are the
# legacy camelCase PyPDF2 names; newer PyPDF2 releases expose PdfReader,
# .pages and extract_text() instead -- confirm against the installed version.

# The phrase is the same for every file and page, so define it and compile
# the pattern once, outside both loops.
search_phrase = "New York State Real Property Law"
phrase_pattern = re.compile(search_phrase)

for k in range(1, 100):
    # Named `reader` rather than `object`, which would shadow the builtin.
    reader = PyPDF2.PdfFileReader("C:/my_path/file%s.pdf" % k)
    num_pages = reader.getNumPages()

    for page_index in range(num_pages):
        page = reader.getPage(page_index)
        print("this is page " + str(page_index))
        page_text = page.extractText()
        # Prints a re.Match object when the phrase occurs on the page,
        # otherwise None.
        print(phrase_pattern.search(page_text))