Parsing strings containing diacritical marks (macros?)
The following works with the input described above.
\documentclass{article}
\usepackage{xparse}
\ExplSyntaxOn
% User-level entry point: \getargs{<text>} passes <text> on, unchanged,
% to the code-level function \get_args:n.
\NewDocumentCommand \getargs { m }
  {
    \get_args:n { #1 }
  }
% Holds the number of pieces found by the most recent \get_args:n call.
\int_new:N \narg
% \get_args:n {<text>}
%   Splits <text> at spaces and stores the pieces in \argi, \argii, ...
%   (roman-numeral suffix); \narg is set to the number of pieces.
\cs_new_protected:Npn \get_args:n #1
  {
    % Start from zero instead of trusting whatever value the shared
    % scratch register \l_tmpa_int currently holds.
    \int_zero:N \l_tmpa_int
    \seq_set_split:Nnn \l_tmpa_seq {~} {#1}
    \seq_map_inline:Nn \l_tmpa_seq
      {
        \int_incr:N \l_tmpa_int
        % Store each piece unexpanded (cn rather than cx): x-type expansion
        % of user text fails on commands that do not survive an \edef, such
        % as the accent macro in h\"ut.  The c-type argument still expands
        % \int_to_roman:n while the macro name is being built.
        \tl_set:cn { arg \int_to_roman:n { \l_tmpa_int } } {##1}
      }
    \int_set_eq:NN \narg \l_tmpa_int
    % Leave the scratch register at zero, as the earlier version did.
    \int_zero:N \l_tmpa_int
  }
\ExplSyntaxOff
\begin{document}
% Plain two-word input; \argi and \argii are typeset right after the call.
\getargs{mein hut} \argi~\argii $<$NO PROBLEM\\
BUT PUT UMLAUT ON THE u AND IT BREAKS
% Same call, but the second word now contains the accent command \".
\getargs{mein h\"ut} \argi~\argii
\end{document}
Here is a solution that performs no expansion and is able to parse macros and brace groups {...}:
\documentclass{article}
\makeatletter
% Parser state: the number of the argument being collected and the
% tokens gathered for it so far.
\newcount\arg@index
\newtoks\code@toks
% \getargsF{<text>}: reset the state, then scan <text> one token at a time
% until the \parse@stop sentinel placed after it is reached.
\def\getargsF#1{%
  \code@toks={}%
  \arg@index=\@ne
  \parse@i#1\parse@stop
}
% Append the tokens given as argument, unexpanded, to the \code@toks register.
\def\add@tok#1{%
  \code@toks=\expandafter{\the\code@toks #1}%
}
% Sentinel placed after the input by \getargsF. It expands to itself, so it
% is never meant to be executed: \parse@ii recognises it with \ifx and
% \parse@stop@i removes it as a delimiter.
\def\parse@stop{\parse@stop}
% Peek at the next token without consuming it (its meaning is stored in
% \nxttok), then let \parse@ii decide how that token has to be read.
\def\parse@i{\futurelet\nxttok\parse@ii}
% Dispatch on the token that \parse@i has just peeked at (\nxttok).
% Default: an ordinary token, read by \testtoken. The tests below override
% that choice; the later a test comes, the higher its priority, so the
% order is brace group < space < end-of-input sentinel.
\def\parse@ii{%
  \let\next@action\testtoken
  \ifx\nxttok\bgroup     \let\next@action\read@bracearg \fi
  \ifx\nxttok\@sptoken   \let\next@action\read@space    \fi
  \ifx\nxttok\parse@stop \let\next@action\parse@stop@i  \fi
  \next@action
}
% End of input: consume the \parse@stop sentinel (it is the delimiter in the
% parameter text) and store the argument collected last.
\def\parse@stop@i\parse@stop{\assign@arg}
% A space ends the current argument. \space is expanded before \def runs,
% so the parameter text of \read@space is a single space token and calling
% the macro removes that space from the input.
\expandafter\def\expandafter\read@space\space{%
  \assign@arg
  \code@toks={}%
  \advance\arg@index by\@ne
  \parse@i
}
% A brace group is grabbed as one macro argument (which strips its braces)
% and appended to \code@toks with the braces put back.
\def\read@bracearg#1{%
  \code@toks\expandafter{\the\code@toks{#1}}%
  \parse@i
}
% \testtoken<token>: handle one token that is neither a space, a brace
% group nor the end sentinel.
%   - A ~ whose category code is currently active ends the argument being
%     collected, exactly like a space.
%   - Any other token (including a ~ that is not active) is appended to
%     the argument being collected.
% The action is only *chosen* inside the conditionals and applied after
% they are closed. The token itself must not sit in a branch that TeX may
% skip: if it is a conditional primitive (\ifx, \else, \fi, ...) the
% skipped text would no longer nest properly and parsing would derail.
\def\testtoken#1{%
  \let\next@action\@firstoftwo
  \if\noexpand~\noexpand#1%
    % Reached only when the token has the character code of ~, so \~
    % yields the same number for \catcode as the token itself would.
    \ifnum\catcode`\~=\active
      \let\next@action\@secondoftwo
    \fi
  \fi
  \next@action
    {\add@tok{#1}}%
    {\assign@arg\advance\arg@index\@ne\code@toks{}}%
  \parse@i
}
% Store the tokens collected in \code@toks as \arg<n>, where <n> is the
% current value of \arg@index in lowercase roman numerals (\argi, \argii,
% ...). Inside \edef the result of \the\code@toks is not expanded further,
% so the stored tokens stay as they were read.
\def\assign@arg{%
  \expandafter\edef\csname arg\@roman\arg@index\endcsname{\the\code@toks}%
}
\makeatother
\begin{document}
% Input that uses both an ordinary space and an active ~ between words.
\getargsF{abcd efgh~h\"ut}\argi\#\argii\#\argiii\#
% Input containing math, nested font commands (brace groups) and accent macros.
\getargsF{maths $1+1=2$ \textbf{b\textit{ol}d} \^a\'i\'o~hardspc~end}\argi\#\argii\#\argiii\#\argiv\#\argv\#\argvi\#
% From here on ~ has category code 12 (other) instead of being active.
\catcode`\~12
\getargsF{maths $1+1=2$ \textbf{b\textit{ol}d} \^a\'i\'o~hardspc~end}\argi\#\argii\#\argiii\#\argiv\#
\end{document}
which gives
Once you have expanded everything in the original input with \protected@edef,
you really don't need to step through it token by token, an approach which makes it hard to handle any kind of brace group or command taking arguments. The code below just scans with a delimited argument, looking for spaces, which is much simpler.
\documentclass{article}
\makeatletter
% End-of-input marker: \getargsF places it after the text to be split and
% \parse@Block@ recognises it with \ifx.
\def\string@end{$\SaveHardspace}
% While this is T, \getargsF redefines ~ as an ordinary space during parsing.
\def\converttilde{T}
% Counts the arguments found; its roman form builds the names \argi, \argii, ...
\newcounter{arg@index}
% Copy of the current meaning of ~, used by \getargsF to restore it afterwards.
\let\SaveHardspace~%%%
% \getargsF{<text>}
%   Expands <text> with \protected@edef, then splits the result at spaces;
%   the pieces are stored by \parse@Block as \argi, \argii, ...
%   While \converttilde is T, ~ is turned into a space first and therefore
%   separates arguments as well.
\def\getargsF#1{%
  % Remember the meaning ~ has right now. Restoring from a copy taken once,
  % when the file was read, would discard any redefinition of ~ made since
  % then (for instance by a language package) after the first call.
  \let\SaveHardspace~%
  \if T\converttilde\def~{ }\fi
  \protected@edef\the@string{#1}%
  \setcounter{arg@index}{0}%
  % A space typed directly after the control word \the@string would be
  % dropped when the line is tokenized. \lowercase leaves the control words
  % inside its braces unchanged, and the space after the closing brace
  % survives as the delimiter that ends the last argument.
  \lowercase{\expandafter\parse@Block\the@string} \string@end
  \let~\SaveHardspace
}
% \parse@Block<text><space>: take everything up to the next space as one
% argument, store it under the next \arg<roman> name, then peek at the
% following token (kept in \tmp) so \parse@Block@ can decide whether to go on.
\def\parse@Block#1 {%
  \stepcounter{arg@index}%
  \expandafter\def\csname arg\roman{arg@index}\endcsname{#1}%
  \futurelet\tmp\parse@Block@
}
% Decide what follows the argument just stored:
%   - the end marker \string@end: record the argument count in \narg and
%     discard the marker;
%   - anything else: scan the next space-delimited argument.
\def\parse@Block@{%
  \ifx\tmp\string@end
    \expandafter\@firstoftwo
  \else
    \expandafter\@secondoftwo
  \fi
  {\edef\narg{\thearg@index}\@gobble}%
  {\parse@Block}%
}
\makeatother
\begin{document}
% Two-word input; the pieces and the count \narg are typeset after the call.
\getargsF{mein hut} \argi~\argii~[\narg] $<$ NO PROBLEM
BUT PUT UMLAUT ON THE u AND IT BREAKS
% Three-word input whose second word contains the accent command \".
\getargsF{mein h\"ut see} \argi~\argii~[\narg]
\end{document}