% \iffalse meta-comment
%
%% File: l3text-utils.dtx
%
% Copyright (C) 2026 The LaTeX Project
%
% It may be distributed and/or modified under the conditions of the
% LaTeX Project Public License (LPPL), either version 1.3c of this
% license or (at your option) any later version.  The latest version
% of this license is in the file
%
%    https://www.latex-project.org/lppl.txt
%
% This file is part of the "l3kernel bundle" (The Work in LPPL)
% and all files in that bundle must be distributed together.
%
% -----------------------------------------------------------------------
%
% The development version of the bundle can be found at
%
%    https://github.com/latex3/latex3
%
% for those people who are interested.
%
%<*driver>
\documentclass[full,kernel]{l3doc}
\begin{document}
  \DocInput{\jobname.dtx}
\end{document}
%</driver>
% \fi
%
% \title{^^A
%   The \pkg{l3text-utils} module\\ Text processing (support utilities)^^A
% }
%
% \author{^^A
%  The \LaTeX{} Project\thanks
%    {^^A
%      E-mail:
%        \href{mailto:latex-team@latex-project.org}
%          {latex-team@latex-project.org}^^A
%    }^^A
% }
%
% \date{Released 2026-03-20}
%
% \maketitle
%
% \begin{documentation}
%
% \end{documentation}
%
% \begin{implementation}
%
% \section{\pkg{l3text-utils} implementation}
%
%    \begin{macrocode}
%<*code>
%    \end{macrocode}
%
% \subsection{Parsing BCP~47 strings}
%
%    \begin{macrocode}
%<@@=text_bcp>
%    \end{macrocode}
%
% For a reference implementation in JavaScript, see
% \url{https://github.com/wooorm/bcp-47/}. This gives clear details of the
% overall algorithm needed.
%
% \begin{variable}{\c_@@_normal_prop}
%   There are a small number of non-standard tags which are grandfathered into
%   the current standard. Here, we set up a mapping to the equivalent standard
%   version, which allows us to avoid complexity. The commented lines have
%   no equivalent I can track down at the moment! The \texttt{prop} is made
%   into the linked form as this gains efficiency in the lookup.
%    \begin{macrocode}
\prop_const_linked_from_keyval:Nn \c_@@_normal_prop
  {
%   Irregular grandfathered tags (RFC 5646 terminology).
    en-gb-oed   = en-gb-oxendict ,
    i-ami       = ami            ,
    i-bnn       = bnn            ,
%    i-default   =                ,
%    i-enochian  =                ,
    i-hak       = hak            ,
    i-klingon   = tlh            ,
    i-lux       = lb             ,
%    i-mingo     =                ,
    i-navajo    = nv             ,
    i-pwn       = pwn            ,
    i-tao       = tao            ,
    i-tay       = tay            ,
    i-tsu       = tsu            ,
    sgn-be-fr   = sfb            ,
    sgn-be-nl   = vgt            ,
    sgn-ch-de   = sgg            ,
%   Regular grandfathered tags (RFC 5646 terminology).
    art-lojban  = jbo            ,
%    cel-gaulish =                ,
    no-bok      = nb             ,
    no-nyn      = nn             ,
    zh-guoyu    = cmn            ,
    zh-hakka    = hak            ,
%    zh-min      =                ,
    zh-min-nan  = nan            ,
    zh-xiang    = hsn
  }
%    \end{macrocode}
% \end{variable}
%
% \begin{macro}[EXP]{\text_bcp_parse:n}
% \begin{macro}[EXP]{\@@_parse_auxi:n}
% \begin{macro}[EXP]{\@@_parse_auxii:nn}
% \begin{macro}[EXP]{\@@_parse_auxiii:n}
% \begin{macro}[EXP]{\@@_parse_auxiv:w}
% \begin{macro}[EXP]{\@@_parse_auxv:w}
% \begin{macro}[EXP]{\@@_parse_auxvi:n}
% \begin{macro}[EXP]{\@@_parse_auxvii:NNNN}
% \begin{macro}[EXP]{\@@_parse_extlang:n}
% \begin{macro}[EXP]{\@@_parse_extlang:nw}
% \begin{macro}[EXP]{\@@_parse_extlang:nn}
% \begin{macro}[EXP]{\@@_parse_script:n}
% \begin{macro}[EXP]{\@@_parse_script:w}
% \begin{macro}[EXP]{\@@_parse_region:n}
% \begin{macro}[EXP]{\@@_parse_region:w}
% \begin{macro}[EXP]{\@@_parse_variant_chk:n}
% \begin{macro}[EXP]{\@@_parse_variant_chk:NNNN}
% \begin{macro}[EXP]{\@@_parse_variant:n}
% \begin{macro}[EXP]{\@@_parse_variant:nw}
% \begin{macro}[EXP]{\@@_parse_variant:nn}
% \begin{macro}[EXP]{\@@_parse_variant_chk:nn}
% \begin{macro}[EXP]{\@@_parse_variant_chk:NNNNn}
% \begin{macro}[EXP]{\@@_parse_variant_end:nn}
% \begin{macro}[EXP]{\@@_parse_ext:n}
% \begin{macro}[EXP]{\@@_parse_ext:nN}
% \begin{macro}[EXP]{\@@_parse_ext:nNnw}
% \begin{macro}[EXP]{\@@_parse_private:nw}
% \begin{macro}[EXP]{\@@_parse_count:n}
% \begin{macro}[EXP]{\@@_parse_count_auxi:w}
% \begin{macro}[EXP]{\@@_parse_count_auxii:w}
% \begin{macro}[EXP]{\@@_parse_count_auxiii:w}
% \begin{macro}[EXP]{\@@_parse_count_auxiv:N}
%   Before we get to the business end of the parse, we need to deal with the
%   special cases: entirely blank input or one of the non-standard inputs
%   above. We also want to deal with a string not a token list, but do that
%   once any replacement is sorted.
%    \begin{macrocode}
\cs_new:Npn \text_bcp_parse:n #1
  {
    \tl_if_blank:nTF {#1}
      { \msg_expandable_error:nn { text } { bcp-blank } }
      {
        \use:e
          { \exp_not:N \@@_parse_auxi:n { \str_casefold:n {#1} } }
      }
  }
%    \end{macrocode}
%   The case-folded string is looked up in the table of non-standard tags.
%   The result of that lookup is passed on along with the string itself:
%   if the lookup gave nothing then the string is parsed as given, otherwise
%   the replacement is parsed instead.
%    \begin{macrocode}
\cs_new:Npn \@@_parse_auxi:n #1
  {
    \exp_args:Ne \@@_parse_auxii:nn
      { \prop_item:Nn \c_@@_normal_prop {#1} } {#1}
  }
\cs_new:Npn \@@_parse_auxii:nn #1#2
  {
    \tl_if_blank:nTF {#1}
      { \use_ii:nn }
      { \use_i:nn }
        { \@@_parse_auxiii:n {#1} }
        { \@@_parse_auxiii:n {#2} }
  }
%    \end{macrocode}
%   The main loop is set up to allow us to iterate over each block, separated
%   by a |-| token. The first block \emph{must} be the language, which can be
%   either two or three characters: this is an easy test. There doesn't have
%   to be any other input, so things could well stop here. If not, we need
%   to know how many characters are in the next block to proceed.
%    \begin{macrocode}
\cs_new:Npn \@@_parse_auxiii:n #1
  { \@@_parse_auxiv:w #1 - \q_recursion_tail - \q_recursion_stop }
%    \end{macrocode}
%   The language is the first |-|-delimited block. If it is valid it is
%   output as the first field and the loop over the remaining blocks starts.
%   If it is not, the error is raised and everything up to and including the
%   end marker is discarded: otherwise the unparsed remainder of the input,
%   including the recursion quarks, would be left in the input stream.
%    \begin{macrocode}
\cs_new:Npn \@@_parse_auxiv:w #1 -
  {
    \int_compare:nTF { 1 < \@@_parse_count:n {#1} < 4 }
      {
        {#1}
        \@@_parse_auxv:w
      }
      {
        \msg_expandable_error:nn { text } { bcp-invalid-lang }
        \use_none_delimit_by_q_recursion_stop:w
      }
  }
%    \end{macrocode}
%   We will see versions of this several times. We know that there are a number
%   of valid subtag types at this point, differentiated by their length. (A
%   length of zero is never valid, but we do not special case it in the
%   counting code as it's quite unlikely.) There's therefore a split to choose
%   the appropriate subtag parser.
%    \begin{macrocode}
\cs_new:Npn \@@_parse_auxv:w #1 -
  {
%   Input ends straight after the language: the six remaining fields
%   (extlang, script, region, variant, extension, private use) are empty.
    \quark_if_recursion_tail_stop_do:nn {#1}
      { { } { } { } { } { } { } }
%   Otherwise select a parser from the length of the block, first emitting
%   one empty group for each field that is skipped over.
    \int_case:nnF { \@@_parse_count:n {#1} }
      {
        { 1 } { { } { } { } { } \@@_parse_ext:n }
        { 2 } { { } { } \@@_parse_region:n }
        { 3 } { \@@_parse_extlang:n }
        { 4 } { { } \@@_parse_auxvi:n }
        { 5 } { { } { } { } \@@_parse_variant:n }
        { 6 } { { } { } { } \@@_parse_variant:n }
        { 7 } { { } { } { } \@@_parse_variant:n }
        { 8 } { { } { } { } \@@_parse_variant:n }
      }
      {
        \msg_expandable_error:nn { text } { bcp-invalid-subtag }
        \use_none_delimit_by_q_recursion_stop:w
      }
%   The block is the argument of whichever parser was selected; on error
%   it is discarded along with the rest of the input.
        {#1}
  }
%    \end{macrocode}
%   There is one case that cannot be determined purely on block length.
%   Both variants and scripts can be made up of four characters, but for
%   variants the first character has to be a digit.
%    \begin{macrocode}
\cs_new:Npn \@@_parse_auxvi:n #1
  { \@@_parse_auxvii:NNNN #1 }
\cs_new:Npn \@@_parse_auxvii:NNNN #1#2#3#4
  {
%   A leading digit means a variant (after empty script and region fields);
%   anything else is treated as a script.
    \bool_lazy_and:nnTF
      { ! \int_compare_p:nNn {`#1} < { `0 } }
      { ! \int_compare_p:nNn {`#1} > { `9 } }
      { { } { } \@@_parse_variant:n }
      { \@@_parse_script:n }
        {#1#2#3#4}
  }
%    \end{macrocode}
%    The first block allowed after the language is \enquote{extended language},
%    which can have up to three entries of three characters.
%    \begin{macrocode}
\cs_new:Npn \@@_parse_extlang:n #1
  { \@@_parse_extlang:nw { {#1} } }
%    \end{macrocode}
%   The first argument is the list of extended language subtags found so
%   far, each one braced; the second is the next block. Whichever way the
%   loop ends, the list is emitted with exactly one additional pair of
%   braces, so that the field has the same shape whether or not further
%   subtags follow.
%    \begin{macrocode}
\cs_new:Npn \@@_parse_extlang:nw #1#2 -
  {
    \quark_if_recursion_tail_stop_do:nn {#2}
      { {#1} { } { } { } { } { } }
    \int_case:nnF { \@@_parse_count:n {#2} }
      {
        { 1 } { {#1} { } { } { } \@@_parse_ext:n }
        { 2 } { {#1} { } \@@_parse_region:n }
        { 3 } { \@@_parse_extlang:nn {#1} }
        { 4 } { {#1} \@@_parse_auxvi:n }
        { 5 } { {#1} { } { } \@@_parse_variant:n }
        { 6 } { {#1} { } { } \@@_parse_variant:n }
        { 7 } { {#1} { } { } \@@_parse_variant:n }
        { 8 } { {#1} { } { } \@@_parse_variant:n }
      }
      {
        \msg_expandable_error:nn { text } { bcp-invalid-subtag }
        \use_none_delimit_by_q_recursion_stop:w
      }
        {#2}
  }
%    \end{macrocode}
%   A further three-character block is only acceptable if fewer than three
%   have been collected already.
%    \begin{macrocode}
\cs_new:Npn \@@_parse_extlang:nn #1#2
  {
    \int_compare:nNnTF { \tl_count:n {#1} } = 3
      {
        \msg_expandable_error:nn { text } { bcp-invalid-subtag }
        \use_none_delimit_by_q_recursion_stop:w
      }
      { \@@_parse_extlang:nw { #1 {#2} } }
  }
%    \end{macrocode}
%   The next valid block is a script: a single entry so not a lot to do.
%    \begin{macrocode}
\cs_new:Npn \@@_parse_script:n #1
  {
%   Emit the script field, then examine whatever block follows it.
    {#1}
    \@@_parse_script:w
  }
\cs_new:Npn \@@_parse_script:w #1 -
  {
%   Nothing after the script: the region, variant, extension and private
%   use fields are all empty.
    \quark_if_recursion_tail_stop_do:nn {#1}
      { { } { } { } { } }
%   There is no entry for a three-character block here, so one appearing
%   after the script raises the error below. A four-character block has
%   to pass the leading-digit check before it is accepted as a variant.
    \int_case:nnF { \@@_parse_count:n {#1} }
      {
        { 1 } { { } { } \@@_parse_ext:n }
        { 2 } { \@@_parse_region:n }
        { 4 } { { } \@@_parse_variant_chk:n }
        { 5 } { { } \@@_parse_variant:n }
        { 6 } { { } \@@_parse_variant:n }
        { 7 } { { } \@@_parse_variant:n }
        { 8 } { { } \@@_parse_variant:n }
      }
      {
        \msg_expandable_error:nn { text } { bcp-invalid-subtag }
        \use_none_delimit_by_q_recursion_stop:w
      }
%   Argument for the selected parser, or discarded on error.
        {#1}
  }
%    \end{macrocode}
%   Much the same story for the region: a single block with simply fewer
%   possible blocks after it.
%    \begin{macrocode}
\cs_new:Npn \@@_parse_region:n #1
  {
%   Emit the region field, then examine whatever block follows it.
    {#1}
    \@@_parse_region:w
  }
\cs_new:Npn \@@_parse_region:w #1 -
  {
%   Nothing after the region: the variant, extension and private use
%   fields are all empty.
    \quark_if_recursion_tail_stop_do:nn {#1}
      { { } { } { } }
%   Only a one-letter block or a variant (four to eight characters) is
%   accepted here; lengths two and three fall through to the error.
    \int_case:nnF { \@@_parse_count:n {#1} }
      {
        { 1 } { { } \@@_parse_ext:n }
        { 4 } { \@@_parse_variant_chk:n }
        { 5 } { \@@_parse_variant:n }
        { 6 } { \@@_parse_variant:n }
        { 7 } { \@@_parse_variant:n }
        { 8 } { \@@_parse_variant:n }
      }
      {
        \msg_expandable_error:nn { text } { bcp-invalid-subtag }
        \use_none_delimit_by_q_recursion_stop:w
      }
%   Argument for the selected parser, or discarded on error.
        {#1}
  }
%    \end{macrocode}
%   The same idea about a four-character block as we've already seen: to be a
%   valid variant, it has to start with a digit. Unlike the earlier version, at
%   this stage a script is not allowed, so anything except a leading digit is
%   an error.
%    \begin{macrocode}
\cs_new:Npn \@@_parse_variant_chk:n #1
  { \@@_parse_variant_chk:NNNN #1 }
\cs_new:Npn \@@_parse_variant_chk:NNNN #1#2#3#4
  {
%   Only a leading digit makes a four-character block a variant here;
%   otherwise the block and the rest of the input are discarded.
    \bool_lazy_and:nnTF
      { ! \int_compare_p:nNn {`#1} < { `0 } }
      { ! \int_compare_p:nNn {`#1} > { `9 } }
      { \@@_parse_variant:n }
      {
        \msg_expandable_error:nn { text } { bcp-invalid-subtag }
        \use_none_delimit_by_q_recursion_stop:w
      }
        {#1#2#3#4}
  }
%    \end{macrocode}
%   Variants form an open-ended list so a loop is required to handle this.
%   At each step, the length of the next block (if present) needs to be
%   checked: if it's a valid variant, keep collecting, otherwise it's an
%   extension or an error.
%    \begin{macrocode}
\cs_new:Npn \@@_parse_variant:n #1
  { \@@_parse_variant:nw { {#1} } }
%    \end{macrocode}
%   The first argument is the list of variants collected so far, each one
%   braced; the second is the next block. All of the auxiliaries selected by
%   the length test take the \emph{new} block first and the collected list
%   second, so that is the order in which the two trailing arguments are
%   supplied.
%    \begin{macrocode}
\cs_new:Npn \@@_parse_variant:nw #1#2 -
  {
    \quark_if_recursion_tail_stop_do:nn {#2}
      { {#1} { } { } }
    \int_case:nnF { \@@_parse_count:n {#2} }
      {
        { 1 } { \@@_parse_variant_end:nn }
        { 4 } { \@@_parse_variant_chk:nn }
        { 5 } { \@@_parse_variant:nn }
        { 6 } { \@@_parse_variant:nn }
        { 7 } { \@@_parse_variant:nn }
        { 8 } { \@@_parse_variant:nn }
      }
      {
        \msg_expandable_error:nn { text } { bcp-invalid-subtag }
        \use_none_delimit_by_q_recursion_stop:w
      }
        {#2} {#1}
  }
%    \end{macrocode}
%   Append the new variant to the list and carry on.
%    \begin{macrocode}
\cs_new:Npn \@@_parse_variant:nn #1#2
  { \@@_parse_variant:nw { #2 {#1} } }
%    \end{macrocode}
%   A four-character block has to start with a digit to be a variant. The
%   block is split into its four characters for the test, with the collected
%   list carried along as a fifth argument so that it can be handed on
%   intact.
%    \begin{macrocode}
\cs_new:Npn \@@_parse_variant_chk:nn #1#2
  { \@@_parse_variant_chk:NNNNn #1 {#2} }
\cs_new:Npn \@@_parse_variant_chk:NNNNn #1#2#3#4#5
  {
    \bool_lazy_or:nnTF
      { \int_compare_p:nNn {`#1} < { `0 } }
      { \int_compare_p:nNn {`#1} > { `9 } }
      {
        \msg_expandable_error:nn { text } { bcp-invalid-subtag }
        \use_none_delimit_by_q_recursion_stop:w
      }
      { \@@_parse_variant:nn }
        {#1#2#3#4} {#5}
  }
%    \end{macrocode}
%   A one-letter block ends the variants: emit the collected list as the
%   variant field and pass the letter to the extension parser.
%    \begin{macrocode}
\cs_new:Npn \@@_parse_variant_end:nn #1#2
  {
    {#2}
    \@@_parse_ext:n {#1}
  }
%    \end{macrocode}
%   There are only three possible valid one-letter blocks: the extensions
%   |t| and |u|, and the private use marker |x|. All of these then allow an
%   open-ended set of subtags, the only restriction being that these cannot
%   be a single letter other than after |x|. So we need to collect up quite
%   a bit of information whilst allowing for the fact that each of |u| and
%   |t| should occur only once.
%    \begin{macrocode}
\cs_new:Npn \@@_parse_ext:n #1
  {
%   Start with no extensions collected. The letter test does the rest:
%   for the private use marker with nothing collected it emits an empty
%   extension field and then starts the private use loop.
    \@@_parse_ext:nN { } #1
  }
%    \end{macrocode}
%   Test for a valid letter, then start collecting up or switch to the private
%   use area. Each extension can only be given once, and the comparison needs
%   to be case-insensitive, so there is a little work to do.
%    \begin{macrocode}
\cs_new:Npn \@@_parse_ext:nN #1#2
  {
%   #1 holds the extensions completed so far as letter/subtag-list pairs;
%   #2 is the one-letter block just read.
    \str_case:nnTF {#2}
      {
        { t } { }
        { u } { }
      }
      {
%       Reject a letter that already heads the list, and anything once
%       more than one pair is present.
        \bool_lazy_and:nnTF
          { ! \tl_if_head_eq_charcode_p:nN {#1} #2 }
          { ! \int_compare_p:nNn { \tl_count:n {#1} } > 2 }
          { \@@_parse_ext:nNnw {#1} #2 { } }
          {
            \msg_expandable_error:nn { text } { bcp-invalid-subtag }
            \use_none_delimit_by_q_recursion_stop:w
          }
      }
      {
        \str_if_eq:nnTF {#2} { x }
          {
            {#1}
            \@@_parse_private:nw { }
          }
          {
            \msg_expandable_error:nn { text } { bcp-invalid-subtag }
            \use_none_delimit_by_q_recursion_stop:w
          }
      }
  }
%    \end{macrocode}
%   The loop for extensions: largely just collection with a test in case we
%   find another extension or private use marker.
%    \begin{macrocode}
\cs_new:Npn \@@_parse_ext:nNnw #1#2#3#4 -
  {
%   #1: extensions already completed; #2: the current extension letter;
%   #3: subtags collected for #2 so far; #4: the next block.
%   At the end of the input the current extension must have at least one
%   subtag; the extension field is then emitted, followed by an empty
%   private use field.
    \quark_if_recursion_tail_stop_do:nn {#4}
      {
        \str_if_empty:nTF {#3}
          { \msg_expandable_error:nn { text } { bcp-invalid-subtag } }
          { { #1 #2 {#3} } { } }
      }
%   Two to eight characters: one more subtag for the current extension.
    \int_compare:nTF { 1 < \@@_parse_count:n {#4} < 9 }
      { \@@_parse_ext:nNnw {#1} #2 { #3 {#4} } }
      {
        \int_compare:nNnTF { \@@_parse_count:n {#4} } = 1
          {
%           A one-letter block closes the current extension, which again
%           must not be empty, and is handed to the letter test along
%           with the updated list.
            \str_if_empty:nTF {#3}
              {
                \msg_expandable_error:nn { text } { bcp-invalid-subtag }
                \use_none_delimit_by_q_recursion_stop:w
              }
              {
                \use:e
                  {
                    \exp_not:n { \@@_parse_ext:nN { #1 #2 {#3} } }
                      \str_casefold:n {#4}
                  }
              }
          }
          {
            \msg_expandable_error:nn { text } { bcp-invalid-subtag }
            \use_none_delimit_by_q_recursion_stop:w
          }
      }
  }
%    \end{macrocode}
%   Private use area: all bets are off! The only tests here are that we need
%   at least one subtag, and that they all need to be shorter than nine
%   characters.
%    \begin{macrocode}
\cs_new:Npn \@@_parse_private:nw #1#2 -
  {
%   #1: private use subtags collected so far; #2: the next block.
%   At the end of the input at least one subtag must have been found;
%   the collected list is then emitted as the final field.
    \quark_if_recursion_tail_stop_do:nn {#2}
      {
        \str_if_empty:nTF {#1}
          { \msg_expandable_error:nn { text } { bcp-invalid-subtag } }
          { {#1} }
      }
%   Any block of one to eight characters is accepted; an empty or
%   over-long one is an error.
    \int_compare:nTF { 0 < \@@_parse_count:n {#2} < 9 }
      { \@@_parse_private:nw { #1 {#2} } }
      {
        \msg_expandable_error:nn { text } { bcp-invalid-subtag }
        \use_none_delimit_by_q_recursion_stop:w
      }
  }
%    \end{macrocode}
%   As BCP~47 is largely specified in terms of number of characters, there is
%   a need to count up repeatedly. Whilst \cs{tl_count:n} is reasonably fast,
%   the predictable nature of the input here means we can use a slightly
%   more focussed approach. We know that input can never be more than
%   8~characters, so can test for that, then get the character number by
%   a simple expansion. This saves around half the tracing lines for typical
%   input lengths.
%    \begin{macrocode}
\cs_new:Npn \@@_parse_count:n #1
  {
    \@@_parse_count_auxi:w #1
      \q_nil \q_nil \q_nil \q_nil \q_nil \q_nil \q_nil \q_nil \q_nil
      \q_stop {#1}
  }
%    \end{macrocode}
%   The input is padded with nine \cs{q_nil} markers, so if the ninth token
%   grabbed is not one of them the input has more than eight tokens. This
%   function is always used inside an integer expression, so nothing but a
%   number may be left behind: the surplus input, the padding and the saved
%   copy of the input are all removed and the result is~$9$. That value is
%   outside every range the callers accept, so each of them raises the error
%   appropriate to its own context.
%    \begin{macrocode}
\cs_new:Npn \@@_parse_count_auxi:w #1#2#3#4#5#6#7#8#9
  {
    \quark_if_nil:NTF #9
      { \@@_parse_count_auxii:w }
      { \use_i_delimit_by_q_stop:nw { 9 \use_none:n } }
  }
%    \end{macrocode}
%   For input of at most eight tokens, discard the unused padding and place
%   the saved copy of the input in front of a descending run of digits.
%   Grabbing nine tokens then leaves the digit equal to the length of the
%   input as the ninth; the digits after it are removed.
%    \begin{macrocode}
\cs_new:Npn \@@_parse_count_auxii:w #1 \q_stop #2
  { \@@_parse_count_auxiii:w #2 876543210 \q_stop }
\cs_new:Npn \@@_parse_count_auxiii:w #1#2#3#4#5#6#7#8#9
  { \@@_parse_count_auxiv:N #9 }
\cs_new:Npn \@@_parse_count_auxiv:N #1#2 \q_stop {#1}
%    \end{macrocode}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
% \end{macro}
%
%    \begin{macrocode}
%   Raised by the block-level parsers for a malformed or misplaced subtag.
\msg_new:nnn
  { text } { bcp-invalid-subtag }
  { Invalid~subtag~in~BCP~input. }
%   Raised when the leading language block has the wrong length.
\msg_new:nnn
  { text } { bcp-invalid-lang }
  { Invalid~language~in~BCP~input. }
%   Raised when the input is blank.
\msg_new:nnn
  { text } { bcp-blank }
  { Empty~input~for~BCP~47~decoding. }
%    \end{macrocode}
%
%    \begin{macrocode}
%</code>
%    \end{macrocode}
%
% \end{implementation}
%
% \PrintIndex
