Longest path finding with condition
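The first step is to normalize the sequences, i.e. turn each customer's edge list (start → end) into the ordered list of nodes it visits: every start node, followed by the end node of the customer's last edge.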
# Build one row per visited node: the start node of every edge, plus the end
# node of each customer's last edge. The stable (mergesort) sort groups the
# rows by customer while keeping the visit order inside each customer.
seqs = (
    pd.concat(
        [
            df.rename(columns={"start": "node"}).drop(columns="end"),
            df.groupby("cusID")
            .tail(1)
            .rename(columns={"end": "node"})
            .drop(columns="start"),
        ]
    )
    .sort_values("cusID", kind="mergesort")
    .reset_index(drop=True)
)
>>> seqs
cusID node
0 001 A
1 001 B
2 001 C
3 001 D
4 001 A
5 001 E
6 001 A
7 002 B
8 002 C
9 002 D
10 002 E
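Then, using a `zero_runs` helper (a function that returns, for each run of zero/False values in an array, a row holding the run's start index and its exclusive end index),
we define: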
def longest_non_a(seq):
    """Return the length of the longest run of consecutive non-"A" nodes.

    Args:
        seq: Node labels of one customer in visit order; compared
            element-wise against "A" (presumably a pandas Series, as it is
            called through ``groupby(...).apply`` -- confirm for other callers).

    Returns:
        The length of the longest stretch that contains no "A", or 0 when
        the sequence has no non-"A" node at all.
    """
    # True where the node is "A"; each run of False is a stretch without "A".
    is_a = seq == "A"
    # Each row of `runs` is treated as a (start, end) pair whose difference
    # is the length of one run.
    runs = zero_runs(is_a)
    if len(runs) == 0:
        # No run found: calling .max() on an empty array would raise
        # ValueError, so report a longest run of length 0 instead.
        return 0
    return (runs[:, 1] - runs[:, 0]).max()
# Evaluate longest_non_a on each customer's node sequence; the resulting
# Series is indexed by cusID.
result = (
    seqs.groupby("cusID")["node"]
    .apply(longest_non_a)
)
>>> result
cusID
001 3
002 4
Name: node, dtype: int64
Alternatively, since this is a graph problem, I suggest you use networkx:
import networkx as nx
# Sample edge list: one row per (customer, start node, end node) transition.
data = {
    "cusID": [
        "001", "001", "001", "001", "001", "001",
        "002", "002", "002",
    ],
    "start": [
        "A", "B", "C", "D", "A", "E",
        "B", "C", "D",
    ],
    "end": [
        "B", "C", "D", "A", "E", "A",
        "C", "D", "E",
    ],
}
# Column-oriented construction: each dict key becomes one column.
df = pd.DataFrame.from_dict(data)
def longest_path(d):
    """Return the number of nodes on the longest path that avoids node "A".

    ``d`` provides one directed edge per row, read from its "start" and
    "end" columns.
    """
    # Build a directed graph from the edge list.
    graph = nx.from_pandas_edgelist(
        d, source="start", target="end", create_using=nx.DiGraph
    )
    # Drop "A" together with its edges; nodes that are not in the graph are
    # silently skipped, so no membership check is needed.
    graph.remove_nodes_from(["A"])
    # The longest path comes back as a list of nodes; report its node count.
    path_nodes = nx.dag_longest_path(graph)
    return len(path_nodes)
# Group by customer and compute the longest path of each group.
# The edge columns are selected explicitly so that apply() does not operate
# on the grouping column (that behaviour is deprecated since pandas 2.2 and
# emits a warning); longest_path only reads "start" and "end" anyway.
result = (
    df.groupby("cusID")[["start", "end"]]
    .apply(longest_path)
    .reset_index()
)
print(result)
Output
cusID 0
0 001 3
1 002 4