How to parse complex text files using Python?
Update 2019 (PEG parser):
This answer has received quite some attention so I felt to add another possibility, namely a parsing option. Here we could use a PEG
parser instead (e.g. parsimonious
) in combination with a NodeVisitor
class:
from parsimonious.grammar import Grammar
from parsimonious.nodes import NodeVisitor
import pandas as pd
grammar = Grammar(
r"""
schools = (school_block / ws)+
school_block = school_header ws grade_block+
grade_block = grade_header ws name_header ws (number_name)+ ws score_header ws (number_score)+ ws?
school_header = ~"^School = (.*)"m
grade_header = ~"^Grade = (\d+)"m
name_header = "Student number, Name"
score_header = "Student number, Score"
number_name = index comma name ws
number_score = index comma score ws
comma = ws? "," ws?
index = number+
score = number+
number = ~"\d+"
name = ~"[A-Z]\w+"
ws = ~"\s*"
"""
)
tree = grammar.parse(data)
class SchoolVisitor(NodeVisitor):
output, names = ([], [])
current_school, current_grade = None, None
def _getName(self, idx):
for index, name in self.names:
if index == idx:
return name
def generic_visit(self, node, visited_children):
return node.text or visited_children
def visit_school_header(self, node, children):
self.current_school = node.match.group(1)
def visit_grade_header(self, node, children):
self.current_grade = node.match.group(1)
self.names = []
def visit_number_name(self, node, children):
index, name = None, None
for child in node.children:
if child.expr.name == 'name':
name = child.text
elif child.expr.name == 'index':
index = child.text
self.names.append((index, name))
def visit_number_score(self, node, children):
index, score = None, None
for child in node.children:
if child.expr.name == 'index':
index = child.text
elif child.expr.name == 'score':
score = child.text
name = self._getName(index)
# build the entire entry
entry = (self.current_school, self.current_grade, index, name, score)
self.output.append(entry)
sv = SchoolVisitor()
sv.visit(tree)
df = pd.DataFrame.from_records(sv.output, columns = ['School', 'Grade', 'Student number', 'Name', 'Score'])
print(df)
Regex option (original answer)
Well then, watching Lord of the Rings the xth time, I had to bridge some time to the very finale:
Broken down, the idea is to split the problem up into several smaller problems:
- Separate each school
- ... each grade
- ... student and scores
- ... bind them together in a dataframe afterwards
The school part (see a demo on regex101.com)
^
School\s*=\s*(?P<school_name>.+)
(?P<school_content>[\s\S]+?)
(?=^School|\Z)
The grade part (another demo on regex101.com)
^
Grade\s*=\s*(?P<grade>.+)
(?P<students>[\s\S]+?)
(?=^Grade|\Z)
The student/score part (last demo on regex101.com):
^
Student\ number,\ Name[\n\r]
(?P<student_names>(?:^\d+.+[\n\r])+)
\s*
^
Student\ number,\ Score[\n\r]
(?P<student_scores>(?:^\d+.+[\n\r])+)
The rest is a generator expression which is then fed into the DataFrame
constructor (along with the column names).
The code:
import pandas as pd, re
rx_school = re.compile(r'''
^
School\s*=\s*(?P<school_name>.+)
(?P<school_content>[\s\S]+?)
(?=^School|\Z)
''', re.MULTILINE | re.VERBOSE)
rx_grade = re.compile(r'''
^
Grade\s*=\s*(?P<grade>.+)
(?P<students>[\s\S]+?)
(?=^Grade|\Z)
''', re.MULTILINE | re.VERBOSE)
rx_student_score = re.compile(r'''
^
Student\ number,\ Name[\n\r]
(?P<student_names>(?:^\d+.+[\n\r])+)
\s*
^
Student\ number,\ Score[\n\r]
(?P<student_scores>(?:^\d+.+[\n\r])+)
''', re.MULTILINE | re.VERBOSE)
result = ((school.group('school_name'), grade.group('grade'), student_number, name, score)
for school in rx_school.finditer(string)
for grade in rx_grade.finditer(school.group('school_content'))
for student_score in rx_student_score.finditer(grade.group('students'))
for student in zip(student_score.group('student_names')[:-1].split("\n"), student_score.group('student_scores')[:-1].split("\n"))
for student_number in [student[0].split(", ")[0]]
for name in [student[0].split(", ")[1]]
for score in [student[1].split(", ")[1]]
)
df = pd.DataFrame(result, columns = ['School', 'Grade', 'Student number', 'Name', 'Score'])
print(df)
Condensed:
rx_school = re.compile(r'^School\s*=\s*(?P<school_name>.+)(?P<school_content>[\s\S]+?)(?=^School|\Z)', re.MULTILINE)
rx_grade = re.compile(r'^Grade\s*=\s*(?P<grade>.+)(?P<students>[\s\S]+?)(?=^Grade|\Z)', re.MULTILINE)
rx_student_score = re.compile(r'^Student number, Name[\n\r](?P<student_names>(?:^\d+.+[\n\r])+)\s*^Student number, Score[\n\r](?P<student_scores>(?:^\d+.+[\n\r])+)', re.MULTILINE)
This yields
School Grade Student number Name Score
0 Riverdale High 1 0 Phoebe 3
1 Riverdale High 1 1 Rachel 7
2 Riverdale High 2 0 Angela 6
3 Riverdale High 2 1 Tristan 3
4 Riverdale High 2 2 Aurora 9
5 Hogwarts 1 0 Ginny 8
6 Hogwarts 1 1 Luna 7
7 Hogwarts 2 0 Harry 5
8 Hogwarts 2 1 Hermione 10
9 Hogwarts 3 0 Fred 0
10 Hogwarts 3 1 George 0
As for timing, this is the result running it a ten thousand times:
import timeit
print(timeit.timeit(makedf, number=10**4))
# 11.918397722000009 s
here is my suggestion using split and pd.concat ("txt" stands for a copy of the original text in the question), basicly the idea is to split by the group words and then concat into data frames, the most inner parsing takes advantage of the fact that the names and grades are in a csv like format. here goes:
import pandas as pd
from io import StringIO
schools = txt.lower().split('school = ')
schools_dfs = []
for school in schools[1:]:
grades = school.split('grade = ')
grades_dfs = []
for grade in grades[1:]:
features = grade.split('student number,')
feature_dfs = []
for feature in features[1:]:
feature_dfs.append(pd.read_csv(StringIO(feature)))
feature_df = pd.concat(feature_dfs, axis=1)
feature_df['grade'] = features[0].replace('\n','')
grades_dfs.append(feature_df)
grades_df = pd.concat(grades_dfs)
grades_df['school'] = grades[0].replace('\n','')
schools_dfs.append(grades_df)
schools_df = pd.concat(schools_dfs)
schools_df.set_index(['school', 'grade'])