pcre2api.3 | pcre2api.3 | |||
---|---|---|---|---|
skipping to change at line 101 | skipping to change at line 101 | |||
int pcre2_set_newline(pcre2_compile_context *ccontext, | int pcre2_set_newline(pcre2_compile_context *ccontext, | |||
uint32_t value); | uint32_t value); | |||
int pcre2_set_parens_nest_limit(pcre2_compile_context *ccontext, | int pcre2_set_parens_nest_limit(pcre2_compile_context *ccontext, | |||
uint32_t value); | uint32_t value); | |||
int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext, | int pcre2_set_compile_recursion_guard(pcre2_compile_context *ccontext, | |||
int (*guard_function)(uint32_t, void *), void *user_data); | int (*guard_function)(uint32_t, void *), void *user_data); | |||
int pcre2_set_optimize(pcre2_compile_context *ccontext, | ||||
uint32_t directive); | ||||
PCRE2 NATIVE API MATCH CONTEXT FUNCTIONS | PCRE2 NATIVE API MATCH CONTEXT FUNCTIONS | |||
pcre2_match_context *pcre2_match_context_create( | pcre2_match_context *pcre2_match_context_create( | |||
pcre2_general_context *gcontext); | pcre2_general_context *gcontext); | |||
pcre2_match_context *pcre2_match_context_copy( | pcre2_match_context *pcre2_match_context_copy( | |||
pcre2_match_context *mcontext); | pcre2_match_context *mcontext); | |||
void pcre2_match_context_free(pcre2_match_context *mcontext); | void pcre2_match_context_free(pcre2_match_context *mcontext); | |||
int pcre2_set_callout(pcre2_match_context *mcontext, | int pcre2_set_callout(pcre2_match_context *mcontext, | |||
int (*callout_function)(pcre2_callout_block *, void *), | int (*callout_function)(pcre2_callout_block *, void *), | |||
void *callout_data); | void *callout_data); | |||
int pcre2_set_substitute_callout(pcre2_match_context *mcontext, | int pcre2_set_substitute_callout(pcre2_match_context *mcontext, | |||
int (*callout_function)(pcre2_substitute_callout_block *, void *), | int (*callout_function)(pcre2_substitute_callout_block *, void *), | |||
void *callout_data); | void *callout_data); | |||
int pcre2_set_substitute_case_callout(pcre2_match_context *mcontext, | ||||
PCRE2_SIZE (*callout_function)(PCRE2_SPTR, PCRE2_SIZE, | ||||
PCRE2_UCHAR *, PCRE2_SIZE, | ||||
int, void *), | ||||
void *callout_data); | ||||
int pcre2_set_offset_limit(pcre2_match_context *mcontext, | int pcre2_set_offset_limit(pcre2_match_context *mcontext, | |||
PCRE2_SIZE value); | PCRE2_SIZE value); | |||
int pcre2_set_heap_limit(pcre2_match_context *mcontext, | int pcre2_set_heap_limit(pcre2_match_context *mcontext, | |||
uint32_t value); | uint32_t value); | |||
int pcre2_set_match_limit(pcre2_match_context *mcontext, | int pcre2_set_match_limit(pcre2_match_context *mcontext, | |||
uint32_t value); | uint32_t value); | |||
int pcre2_set_depth_limit(pcre2_match_context *mcontext, | int pcre2_set_depth_limit(pcre2_match_context *mcontext, | |||
skipping to change at line 586 | skipping to change at line 595 | |||
A compile context is required if you want to provide an external function for stack checking | A compile context is required if you want to provide an external function for stack checking | |||
during compilation or to change the default values of any of the following compile-time parame‐ | during compilation or to change the default values of any of the following compile-time parame‐ | |||
ters: | ters: | |||
What \R matches (Unicode newlines or CR, LF, CRLF only) | What \R matches (Unicode newlines or CR, LF, CRLF only) | |||
PCRE2's character tables | PCRE2's character tables | |||
The newline character sequence | The newline character sequence | |||
The compile time nested parentheses limit | The compile time nested parentheses limit | |||
The maximum length of the pattern string | The maximum length of the pattern string | |||
The extra options bits (none set by default) | The extra options bits (none set by default) | |||
Which performance optimizations the compiler should apply | ||||
A compile context is also required if you are using custom memory management. If none of these | A compile context is also required if you are using custom memory management. If none of these | |||
apply, just pass NULL as the context argument of pcre2_compile(). | apply, just pass NULL as the context argument of pcre2_compile(). | |||
A compile context is created, copied, and freed by the following functions: | A compile context is created, copied, and freed by the following functions: | |||
pcre2_compile_context *pcre2_compile_context_create( | pcre2_compile_context *pcre2_compile_context_create( | |||
pcre2_general_context *gcontext); | pcre2_general_context *gcontext); | |||
pcre2_compile_context *pcre2_compile_context_copy( | pcre2_compile_context *pcre2_compile_context_copy( | |||
skipping to change at line 689 | skipping to change at line 699 | |||
where running out of stack is to be avoided at all costs. The parenthesis limit above cannot | where running out of stack is to be avoided at all costs. The parenthesis limit above cannot | |||
take account of how much stack is actually available during compilation. For a finer control, | take account of how much stack is actually available during compilation. For a finer control, | |||
you can supply a function that is called whenever pcre2_compile() starts to compile a parenthe‐ | you can supply a function that is called whenever pcre2_compile() starts to compile a parenthe‐ | |||
sized part of a pattern. This function can check the actual stack size (or anything else that it | sized part of a pattern. This function can check the actual stack size (or anything else that it | |||
wants to, of course). | wants to, of course). | |||
The first argument to the callout function gives the current depth of nesting, and the second is | The first argument to the callout function gives the current depth of nesting, and the second is | |||
user data that is set up by the last argument of pcre2_set_compile_recursion_guard(). The call‐ | user data that is set up by the last argument of pcre2_set_compile_recursion_guard(). The call‐ | |||
out function should return zero if all is well, or non-zero to force an error. | out function should return zero if all is well, or non-zero to force an error. | |||
int pcre2_set_optimize(pcre2_compile_context *ccontext, | ||||
uint32_t directive); | ||||
PCRE2 can apply various performance optimizations during compilation, in order to make matching | ||||
faster. For example, the compiler might convert some regex constructs into an equivalent con‐ | ||||
struct which pcre2_match() can execute faster. By default, all available optimizations are en‐ | ||||
abled. However, in rare cases, one might wish to disable specific optimizations. For example, if | ||||
it is known that some optimizations cannot benefit a certain regex, it might be desirable to | ||||
disable them, in order to speed up compilation. | ||||
The permitted values of directive are as follows: | ||||
PCRE2_OPTIMIZATION_FULL | ||||
Enable all optional performance optimizations. This is the default value. | ||||
PCRE2_OPTIMIZATION_NONE | ||||
Disable all optional performance optimizations. | ||||
PCRE2_AUTO_POSSESS | ||||
PCRE2_AUTO_POSSESS_OFF | ||||
Enable/disable "auto-possessification" of variable quantifiers such as * and +. This optimiza‐ | ||||
tion, for example, turns a+b into a++b in order to avoid backtracks into a+ that can never be | ||||
successful. However, if callouts are in use, auto-possessification means that some callouts are | ||||
never taken. You can disable this optimization if you want the matching functions to do a full, | ||||
unoptimized search and run all the callouts. | ||||
PCRE2_DOTSTAR_ANCHOR | ||||
PCRE2_DOTSTAR_ANCHOR_OFF | ||||
Enable/disable an optimization that is applied when .* is the first significant item in a top- | ||||
level branch of a pattern, and all the other branches also start with .* or with \A or \G or ^. | ||||
Such a pattern is automatically anchored if PCRE2_DOTALL is set for all the .* items and | ||||
PCRE2_MULTILINE is not set for any ^ items. Otherwise, the fact that any match must start either | ||||
at the start of the subject or following a newline is remembered. Like other optimizations, this | ||||
can cause callouts to be skipped. | ||||
Dotstar anchor optimization is automatically disabled for .* if it is inside an atomic group or | ||||
a capture group that is the subject of a backreference, or if the pattern contains (*PRUNE) or | ||||
(*SKIP). | ||||
PCRE2_START_OPTIMIZE | ||||
PCRE2_START_OPTIMIZE_OFF | ||||
Enable/disable optimizations which cause matching functions to scan the subject string for spe‐ | ||||
cific code unit values before attempting a match. For example, if it is known that an unanchored | ||||
match must start with a specific value, the matching code searches the subject for that value, | ||||
and fails immediately if it cannot find it, without actually running the main matching function. | ||||
This means that a special item such as (*COMMIT) at the start of a pattern is not considered un‐ | ||||
til after a suitable starting point for the match has been found. Also, when callouts or | ||||
(*MARK) items are in use, these "start-up" optimizations can cause them to be skipped if the | ||||
pattern is never actually used. The start-up optimizations are in effect a pre-scan of the sub‐ | ||||
ject that takes place before the pattern is run. | ||||
Disabling start-up optimizations ensures that in cases where the result is "no match", the call‐ | ||||
outs do occur, and that items such as (*COMMIT) and (*MARK) are considered at every possible | ||||
starting position in the subject string. | ||||
Disabling start-up optimizations may change the outcome of a matching operation. Consider the | ||||
pattern | ||||
(*COMMIT)ABC | ||||
When this is compiled, PCRE2 records the fact that a match must start with the character "A". | ||||
Suppose the subject string is "DEFABC". The start-up optimization scans along the subject, finds | ||||
"A" and runs the first match attempt from there. The (*COMMIT) item means that the pattern must | ||||
match the current starting position, which in this case, it does. However, if the same match is | ||||
run without start-up optimizations, the initial scan along the subject string does not happen. | ||||
The first match attempt is run starting from "D" and when this fails, (*COMMIT) prevents any | ||||
further matches being tried, so the overall result is "no match". | ||||
Another start-up optimization makes use of a minimum length for a matching subject, which is | ||||
recorded when possible. Consider the pattern | ||||
(*MARK:1)B(*MARK:2)(X|Y) | ||||
The minimum length for a match is two characters. If the subject is "XXBB", the "starting char‐ | ||||
acter" optimization skips "XX", then tries to match "BB", which is long enough. In the process, | ||||
(*MARK:2) is encountered and remembered. When the match attempt fails, the next "B" is found, | ||||
but there is only one character left, so there are no more attempts, and "no match" is returned | ||||
with the "last mark seen" set to "2". Without start-up optimizations, however, matches are tried | ||||
at every possible starting position, including at the end of the subject, where (*MARK:1) is en‐ | ||||
countered, but there is no "B", so the "last mark seen" that is returned is "1". In this case, | ||||
the optimizations do not affect the overall match result, which is still "no match", but they do | ||||
affect the auxiliary information that is returned. | ||||
The match context | The match context | |||
A match context is required if you want to: | A match context is required if you want to: | |||
Set up a callout function | Set up a callout function | |||
Set an offset limit for matching an unanchored pattern | Set an offset limit for matching an unanchored pattern | |||
Change the limit on the amount of heap used when matching | Change the limit on the amount of heap used when matching | |||
Change the backtracking match limit | Change the backtracking match limit | |||
Change the backtracking depth limit | Change the backtracking depth limit | |||
Set custom memory management specifically for the match | Set custom memory management specifically for the match | |||
If none of these apply, just pass NULL as the context argument of pcre2_match(), | If none of these apply, just pass NULL as the context argument of pcre2_match(), | |||
pcre2_dfa_match(), or pcre2_jit_match(). | pcre2_dfa_match(), or pcre2_jit_match(). | |||
A match context is created, copied, and freed by the following functions: | A match context is created, copied, and freed by the following functions: | |||
pcre2_match_context *pcre2_match_context_create( | pcre2_match_context *pcre2_match_context_create( | |||
pcre2_general_context *gcontext); | pcre2_general_context *gcontext); | |||
pcre2_match_context *pcre2_match_context_copy( | pcre2_match_context *pcre2_match_context_copy( | |||
pcre2_match_context *mcontext); | pcre2_match_context *mcontext); | |||
void pcre2_match_context_free(pcre2_match_context *mcontext); | void pcre2_match_context_free(pcre2_match_context *mcontext); | |||
A match context is created with default values for its parameters. These can be changed by call‐ | A match context is created with default values for its parameters. These can be changed by call‐ | |||
ing the following functions, which return 0 on success, or PCRE2_ERROR_BADDATA if invalid data | ing the following functions, which return 0 on success, or PCRE2_ERROR_BADDATA if invalid data | |||
is detected. | is detected. | |||
int pcre2_set_callout(pcre2_match_context *mcontext, | int pcre2_set_callout(pcre2_match_context *mcontext, | |||
int (*callout_function)(pcre2_callout_block *, void *), | int (*callout_function)(pcre2_callout_block *, void *), | |||
void *callout_data); | void *callout_data); | |||
This sets up a callout function for PCRE2 to call at specified poi nts during a matching opera‐ | This sets up a callout function for PCRE2 to call at specified point s during a matching opera‐ | |||
tion. Details are given in the pcre2callout documentation. | tion. Details are given in the pcre2callout documentation. | |||
int pcre2_set_substitute_callout(pcre2_match_context *mcontext, | int pcre2_set_substitute_callout(pcre2_match_context *mcontext, | |||
int (*callout_function)(pcre2_substitute_callout_block *, void *), | int (*callout_function)(pcre2_substitute_callout_block *, void *), | |||
void *callout_data); | void *callout_data); | |||
This sets up a callout function for PCRE2 to call after each substit ution made by pcre2_substi‐ | This sets up a callout function for PCRE2 to call after each substi tution made by pcre2_substi‐ | |||
tute(). Details are given in the section entitled "Creating a new st ring with substitutions" be‐ | tute(). Details are given in the section entitled "Creating a new st ring with substitutions" be‐ | |||
low. | low. | |||
int pcre2_set_substitute_case_callout(pcre2_match_context *mcontext, | ||||
PCRE2_SIZE (*callout_function)(PCRE2_SPTR, PCRE2_SIZE, | ||||
PCRE2_UCHAR *, PCRE2_SIZE, | ||||
int, void *), | ||||
void *callout_data); | ||||
This sets up a callout function for PCRE2 to call when performing case transformations inside | ||||
pcre2_substitute(). Details are given in the section entitled "Creating a new string with sub‐ | ||||
stitutions" below. | ||||
int pcre2_set_offset_limit(pcre2_match_context *mcontext, | int pcre2_set_offset_limit(pcre2_match_context *mcontext, | |||
PCRE2_SIZE value); | PCRE2_SIZE value); | |||
The offset_limit parameter limits how far an unanchored search can advance in the subject | The offset_limit parameter limits how far an unanchored search can advance in the subject | |||
string. The default value is PCRE2_UNSET. The pcre2_match() and pcre2_dfa_match() functions re‐ | string. The default value is PCRE2_UNSET. The pcre2_match() and pcre2_dfa_match() functions re‐ | |||
turn PCRE2_ERROR_NOMATCH if a match with a starting point before or at the given offset is not | turn PCRE2_ERROR_NOMATCH if a match with a starting point before or at the given offset is not | |||
found. The pcre2_substitute() function makes no more substitutions. | found. The pcre2_substitute() function makes no more substitutions. | |||
For example, if the pattern /abc/ is matched against "123abc" with an offset limit less than 3, | For example, if the pattern /abc/ is matched against "123abc" with an offset limit less than 3, | |||
the result is PCRE2_ERROR_NOMATCH. A match can never be found if the startoffset argument of | the result is PCRE2_ERROR_NOMATCH. A match can never be found if the startoffset argument of | |||
pcre2_match(), pcre2_dfa_match(), or pcre2_substitute() is greater than the offset limit set in | pcre2_match(), pcre2_dfa_match(), or pcre2_substitute() is greater than the offset limit set in | |||
the match context. | the match context. | |||
When using this facility, you must set the PCRE2_USE_OFFSET_LIMIT option when calling pcre2_com‐ | When using this facility, you must set the PCRE2_USE_OFFSET_LIMIT option when calling pcre2_com‐ | |||
pile() so that when JIT is in use, different code can be compiled. If a match is started with a | pile() so that when JIT is in use, different code can be compiled. If a match is started with a | |||
non-default offset limit when PCRE2_USE_OFFSET_LIMIT is not set, an error is generated. | non-default offset limit when PCRE2_USE_OFFSET_LIMIT is not set, an error is generated. | |||
The offset limit facility can be used to track progress when searchi ng large subject strings or | The offset limit facility can be used to track progress when search ing large subject strings or | |||
to limit the extent of global substitutions. See also the PCRE2_FIRS TLINE option, which requires | to limit the extent of global substitutions. See also the PCRE2_FIRS TLINE option, which requires | |||
a match to start before or at the first newline that follows the s | a match to start before or at the first newline that follows the sta | |||
tart of matching in the sub‐ | rt of matching in the sub‐ | |||
ject. If this is set with an offset limit, a match must occur in the | ject. If this is set with an offset limit, a match must occur in th | |||
first line and also within | e first line and also within | |||
the offset limit. In other words, whichever limit comes first is use d. | the offset limit. In other words, whichever limit comes first is use d. | |||
int pcre2_set_heap_limit(pcre2_match_context *mcontext, | int pcre2_set_heap_limit(pcre2_match_context *mcontext, | |||
uint32_t value); | uint32_t value); | |||
The heap_limit parameter specifies, in units of kibibytes (1024 b | The heap_limit parameter specifies, in units of kibibytes (1024 byte | |||
ytes), the maximum amount of | s), the maximum amount of | |||
heap memory that pcre2_match() may use to hold backtracking informat | heap memory that pcre2_match() may use to hold backtracking inform | |||
ion when running an inter‐ | ation when running an inter‐ | |||
pretive match. This limit also applies to pcre2_dfa_match(), whic | pretive match. This limit also applies to pcre2_dfa_match(), which m | |||
h may use the heap when pro‐ | ay use the heap when pro‐ | |||
cessing patterns with a lot of nested pattern recursion or lookaroun | cessing patterns with a lot of nested pattern recursion or lookar | |||
ds or atomic groups. This | ounds or atomic groups. This | |||
limit does not apply to matching with the JIT optimization, whic | limit does not apply to matching with the JIT optimization, which ha | |||
h has its own memory control | s its own memory control | |||
arrangements (see the pcre2jit documentation for more details). If t | arrangements (see the pcre2jit documentation for more details). | |||
he limit is reached, the | If the limit is reached, the | |||
negative error code PCRE2_ERROR_HEAPLIMIT is returned. The default | negative error code PCRE2_ERROR_HEAPLIMIT is returned. The default l | |||
limit can be set when PCRE2 | imit can be set when PCRE2 | |||
is built; if it is not, the default is set very large and is essenti ally unlimited. | is built; if it is not, the default is set very large and is essenti ally unlimited. | |||
A value for the heap limit may also be supplied by an item at the st art of a pattern of the form | A value for the heap limit may also be supplied by an item at the st art of a pattern of the form | |||
(*LIMIT_HEAP=ddd) | (*LIMIT_HEAP=ddd) | |||
where ddd is a decimal number. However, such a setting is ignored un less ddd is less than the | where ddd is a decimal number. However, such a setting is ignored unless ddd is less than the | |||
limit set by the caller of pcre2_match() or, if no such limit is set , less than the default. | limit set by the caller of pcre2_match() or, if no such limit is set , less than the default. | |||
The pcre2_match() function always needs some heap memory, so settin | The pcre2_match() function always needs some heap memory, so setting | |||
g a value of zero guarantees | a value of zero guarantees | |||
a "heap limit exceeded" error. Details of how pcre2_match() uses th | a "heap limit exceeded" error. Details of how pcre2_match() us | |||
e heap are given in the | es the heap are given in the | |||
pcre2perform documentation. | pcre2perform documentation. | |||
For pcre2_dfa_match(), a vector on the system stack is used when pr | For pcre2_dfa_match(), a vector on the system stack is used when pro | |||
ocessing pattern recursions, | cessing pattern recursions, | |||
lookarounds, or atomic groups, and only if this is not big enough is | lookarounds, or atomic groups, and only if this is not big enough | |||
heap memory used. In this | is heap memory used. In this | |||
case, setting a value of zero disables the use of the heap. | case, setting a value of zero disables the use of the heap. | |||
int pcre2_set_match_limit(pcre2_match_context *mcontext, | int pcre2_set_match_limit(pcre2_match_context *mcontext, | |||
uint32_t value); | uint32_t value); | |||
The match_limit parameter provides a means of preventing PCRE2 from using up too many computing | The match_limit parameter provides a means of preventing PCRE2 from using up too many computing | |||
resources when processing patterns that are not going to match, but which have a very large num‐ | resources when processing patterns that are not going to match, but which have a very large num‐ | |||
ber of possibilities in their search trees. The classic example is a pattern that uses nested | ber of possibilities in their search trees. The classic example i s a pattern that uses nested | |||
unlimited repeats. | unlimited repeats. | |||
There is an internal counter in pcre2_match() that is incremen | There is an internal counter in pcre2_match() that is incremented | |||
ted each time round its main | each time round its main | |||
matching loop. If this value reaches the match limit, pcre2_match() | matching loop. If this value reaches the match limit, pcre2_match( | |||
returns the negative value | ) returns the negative value | |||
PCRE2_ERROR_MATCHLIMIT. This has the effect of limiting the amount o f backtracking that can take | PCRE2_ERROR_MATCHLIMIT. This has the effect of limiting the amount o f backtracking that can take | |||
place. For patterns that are not anchored, the count restarts from z ero for each position in the | place. For patterns that are not anchored, the count restarts from z ero for each position in the | |||
subject string. This limit also applies to pcre2_dfa_match(), thou gh the counting is done in a | subject string. This limit also applies to pcre2_dfa_match(), though the counting is done in a | |||
different way. | different way. | |||
When pcre2_match() is called with a pattern that was successfully p rocessed by pcre2_jit_com‐ | When pcre2_match() is called with a pattern that was successfully processed by pcre2_jit_com‐ | |||
pile(), the way in which matching is executed is entirely different. However, there is still the | pile(), the way in which matching is executed is entirely different. However, there is still the | |||
possibility of runaway matching that goes on for a very long time, and so the match_limit value | possibility of runaway matching that goes on for a very long time, a nd so the match_limit value | |||
is also used in this case (but in a different way) to limit how long the matching can continue. | is also used in this case (but in a different way) to limit how long the matching can continue. | |||
The default value for the limit can be set when PCRE2 is built; the default is 10 million, which | The default value for the limit can be set when PCRE2 is built; the default is 10 million, which | |||
handles all but the most extreme cases. A value for the match limit may also be supplied by an | handles all but the most extreme cases. A value for the match limi t may also be supplied by an | |||
item at the start of a pattern of the form | item at the start of a pattern of the form | |||
(*LIMIT_MATCH=ddd) | (*LIMIT_MATCH=ddd) | |||
where ddd is a decimal number. However, such a setting is ignored | where ddd is a decimal number. However, such a setting is ignored un | |||
unless ddd is less than the | less ddd is less than the | |||
limit set by the caller of pcre2_match() or pcre2_dfa_match() or, if | limit set by the caller of pcre2_match() or pcre2_dfa_match() or, i | |||
no such limit is set, less | f no such limit is set, less | |||
than the default. | than the default. | |||
int pcre2_set_depth_limit(pcre2_match_context *mcontext, | int pcre2_set_depth_limit(pcre2_match_context *mcontext, | |||
uint32_t value); | uint32_t value); | |||
This parameter limits the depth of nested backtracking in pcre2_ | This parameter limits the depth of nested backtracking in pcre2_matc | |||
match(). Each time a nested | h(). Each time a nested | |||
backtracking point is passed, a new memory frame is used to remember | backtracking point is passed, a new memory frame is used to remem | |||
the state of matching at | ber the state of matching at | |||
that point. Thus, this parameter indirectly limits the amount of mem ory that is used in a match. | that point. Thus, this parameter indirectly limits the amount of mem ory that is used in a match. | |||
However, because the size of each memory frame depends on the numb | However, because the size of each memory frame depends on the number | |||
er of capturing parentheses, | of capturing parentheses, | |||
the actual memory limit varies from pattern to pattern. This limit w | the actual memory limit varies from pattern to pattern. This limit | |||
as more useful in versions | was more useful in versions | |||
before 10.30, where function recursion was used for backtracking. | before 10.30, where function recursion was used for backtracking. | |||
The depth limit is not relevant, and is ignored, when matching is d one using JIT compiled code. | The depth limit is not relevant, and is ignored, when matching is do ne using JIT compiled code. | |||
However, it is supported by pcre2_dfa_match(), which uses it to limi t the depth of nested inter‐ | However, it is supported by pcre2_dfa_match(), which uses it to limi t the depth of nested inter‐ | |||
nal recursive function calls that implement atomic groups, lookaroun d assertions, and pattern | nal recursive function calls that implement atomic groups, lookar ound assertions, and pattern | |||
recursions. This limits, indirectly, the amount of system stack that is used. It was more useful | recursions. This limits, indirectly, the amount of system stack that is used. It was more useful | |||
in versions before 10.32, when stack memory was used for local wor | in versions before 10.32, when stack memory was used for local works | |||
kspace vectors for recursive | pace vectors for recursive | |||
function calls. From version 10.32, only local variables are allocat | function calls. From version 10.32, only local variables are alloca | |||
ed on the stack and as each | ted on the stack and as each | |||
call uses only a few hundred bytes, even a small stack can support q uite a lot of recursion. | call uses only a few hundred bytes, even a small stack can support q uite a lot of recursion. | |||
If the depth of internal recursive function calls is great enough, | If the depth of internal recursive function calls is great enough, l | |||
local workspace vectors are | ocal workspace vectors are | |||
allocated on the heap from version 10.32 onwards, so the depth limit | allocated on the heap from version 10.32 onwards, so the depth limi | |||
also indirectly limits the | t also indirectly limits the | |||
amount of heap memory that is used. A recursive pattern such as /( | amount of heap memory that is used. A recursive pattern such as /(.( | |||
.(?2))((?1)|)/, when matched | ?2))((?1)|)/, when matched | |||
to a very long string using pcre2_dfa_match(), can use a great deal | to a very long string using pcre2_dfa_match(), can use a great de | |||
of memory. However, it is | al of memory. However, it is | |||
probably better to limit heap usage directly by calling pcre2_set_he ap_limit(). | probably better to limit heap usage directly by calling pcre2_set_he ap_limit(). | |||
The default value for the depth limit can be set when PCRE2 is buil | The default value for the depth limit can be set when PCRE2 is built | |||
t; if it is not, the default | ; if it is not, the default | |||
is set to the same value as the default for the match limit. I | is set to the same value as the default for the match limit | |||
f the limit is exceeded, | . If the limit is exceeded, | |||
pcre2_match() or pcre2_dfa_match() returns PCRE2_ERROR_DEPTHLIMIT. | pcre2_match() or pcre2_dfa_match() returns PCRE2_ERROR_DEPTHLIMIT. A | |||
A value for the depth limit | value for the depth limit | |||
may also be supplied by an item at the start of a pattern of the for m | may also be supplied by an item at the start of a pattern of the for m | |||
(*LIMIT_DEPTH=ddd) | (*LIMIT_DEPTH=ddd) | |||
where ddd is a decimal number. However, such a setting is ignored un | where ddd is a decimal number. However, such a setting is ignored | |||
less ddd is less than the | unless ddd is less than the | |||
limit set by the caller of pcre2_match() or pcre2_dfa_match() or, i | limit set by the caller of pcre2_match() or pcre2_dfa_match() or, if | |||
f no such limit is set, less | no such limit is set, less | |||
than the default. | than the default. | |||
CHECKING BUILD-TIME OPTIONS | CHECKING BUILD-TIME OPTIONS | |||
int pcre2_config(uint32_t what, void *where); | int pcre2_config(uint32_t what, void *where); | |||
The function pcre2_config() makes it possible for a PCRE2 client to | The function pcre2_config() makes it possible for a PCRE2 client | |||
find the value of certain | to find the value of certain | |||
configuration parameters and to discover which optional features | configuration parameters and to discover which optional features hav | |||
have been compiled into the | e been compiled into the | |||
PCRE2 library. The pcre2build documentation has more details about these features. | PCRE2 library. The pcre2build documentation has more details about these features. | |||
The first argument for pcre2_config() specifies which information is | The first argument for pcre2_config() specifies which information i | |||
required. The second argu‐ | s required. The second argu‐ | |||
ment is a pointer to memory into which the information is placed. | ment is a pointer to memory into which the information is placed. If | |||
If NULL is passed, the func‐ | NULL is passed, the func‐ | |||
tion returns the amount of memory that is needed for the requested i | tion returns the amount of memory that is needed for the requested | |||
nformation. For calls that | information. For calls that | |||
return numerical values, the value is in bytes; when requesting these values, where should point | return numerical values, the value is in bytes; when requesting these values, where should point | |||
to appropriately aligned memory. For calls that return strings, the required length is given in | to appropriately aligned memory. For calls that return strings, the required length is given in | |||
code units, not counting the terminating zero. | code units, not counting the terminating zero. | |||
When requesting information, the returned value from pcre2_config() | When requesting information, the returned value from pcre2_config() | |||
is non-negative on success, | is non-negative on success, | |||
or the negative error code PCRE2_ERROR_BADOPTION if the value in th | or the negative error code PCRE2_ERROR_BADOPTION if the value in the | |||
e first argument is not rec‐ | first argument is not rec‐ | |||
ognized. The following information is available: | ognized. The following information is available: | |||
PCRE2_CONFIG_BSR | PCRE2_CONFIG_BSR | |||
The output is a uint32_t integer whose value indicates what character sequences the \R escape | The output is a uint32_t integer whose value indicates what character sequences the \R escape | |||
sequence matches by default. A value of PCRE2_BSR_UNICODE means that \R matches any Unicode line | sequence matches by default. A value of PCRE2_BSR_UNICODE means that \R matches any Unicode line | |||
ending sequence; a value of PCRE2_BSR_ANYCRLF means that \R matches only CR, LF, or CRLF. The | ending sequence; a value of PCRE2_BSR_ANYCRLF means that \R matches only CR, LF, or CRLF. The | |||
default can be overridden when a pattern is compiled. | default can be overridden when a pattern is compiled. | |||
PCRE2_CONFIG_COMPILED_WIDTHS | PCRE2_CONFIG_COMPILED_WIDTHS | |||
The output is a uint32_t integer whose lower bits indicate which code unit widths were selected | The output is a uint32_t integer whose lower bits indicate which code unit widths were selected | |||
when PCRE2 was built. The 1-bit indicates 8-bit support, and the 2-bit and 4-bit indicate 16-bit | when PCRE2 was built. The 1-bit indicates 8-bit support, and the 2-bit and 4-bit indicate 16-bit | |||
and 32-bit support, respectively. | and 32-bit support, respectively. | |||
PCRE2_CONFIG_DEPTHLIMIT | PCRE2_CONFIG_DEPTHLIMIT | |||
The output is a uint32_t integer that gives the default limit for the depth of nested backtrack‐ | The output is a uint32_t integer that gives the default limit for the depth of nested backtrack‐ | |||
ing in pcre2_match() or the depth of nested recursions, lookarounds, and atomic groups in | ing in pcre2_match() or the depth of nested recursions, lookarounds, and atomic groups in | |||
pcre2_dfa_match(). Further details are given with pcre2_set_depth_limit() above. | pcre2_dfa_match(). Further details are given with pcre2_set_depth_limit() above. | |||
PCRE2_CONFIG_HEAPLIMIT | PCRE2_CONFIG_HEAPLIMIT | |||
The output is a uint32_t integer that gives, in kibibytes, the defau | The output is a uint32_t integer that gives, in kibibytes, the def | |||
lt limit for the amount of | ault limit for the amount of | |||
heap memory used by pcre2_match() or pcre2_dfa_match(). Furth | heap memory used by pcre2_match() or pcre2_dfa_match(). Furthe | |||
er details are given with | r details are given with | |||
pcre2_set_heap_limit() above. | pcre2_set_heap_limit() above. | |||
PCRE2_CONFIG_JIT | PCRE2_CONFIG_JIT | |||
The output is a uint32_t integer that is set to one if support for just-in-time compiling is in‐ | The output is a uint32_t integer that is set to one if support for just-in-time compiling is in‐ | |||
cluded in the library; otherwise it is set to zero. Note that having | cluded in the library; otherwise it is set to zero. Note that havin | |||
the support in the library | g the support in the library | |||
does not guarantee that JIT will be used for any given match. See th | does not guarantee that JIT will be used for any given match, and ne | |||
e pcre2jit documentation for | ither does it guarantee that | |||
more details. | JIT will actually be able to function, because it may not be able to | |||
allocate executable memory | ||||
in some environments. There is a special call to pcre2_jit_compile | ||||
() that can be used to check | ||||
this. See the pcre2jit documentation for more details. | ||||
PCRE2_CONFIG_JITTARGET | PCRE2_CONFIG_JITTARGET | |||
The where argument should point to a buffer that is at least 48 | The where argument should point to a buffer that is at least 48 cod | |||
code units long. (The exact | e units long. (The exact | |||
length required can be found by calling pcre2_config() with where se | length required can be found by calling pcre2_config() with where | |||
t to NULL.) The buffer is | set to NULL.) The buffer is | |||
filled with a string that contains the name of the architecture f | filled with a string that contains the name of the architecture for | |||
or which the JIT compiler is | which the JIT compiler is | |||
configured, for example "x86 32bit (little endian + unaligned)". If | configured, for example "x86 32bit (little endian + unaligned)". | |||
JIT support is not avail‐ | If JIT support is not avail‐ | |||
able, PCRE2_ERROR_BADOPTION is returned, otherwise the number of | able, PCRE2_ERROR_BADOPTION is returned, otherwise the number of cod | |||
code units used is returned. | e units used is returned. | |||
This is the length of the string, plus one unit for the terminating zero. | This is the length of the string, plus one unit for the terminating zero. | |||
PCRE2_CONFIG_LINKSIZE | PCRE2_CONFIG_LINKSIZE | |||
The output is a uint32_t integer that contains the number of bytes used for internal linkage in | The output is a uint32_t integer that contains the number of bytes used for internal linkage in | |||
compiled regular expressions. When PCRE2 is configured, the value can be set to 2, 3, or 4, with | compiled regular expressions. When PCRE2 is configured, the value can be set to 2, 3, or 4, with | |||
the default being 2. This is the value that is returned by pcre2_config(). However, when the | the default being 2. This is the value that is returned by pcre2_config(). However, when the | |||
16-bit library is compiled, a value of 3 is rounded up to 4, and when the 32-bit library is com‐ | 16-bit library is compiled, a value of 3 is rounded up to 4, and when the 32-bit library is com‐ | |||
piled, internal linkages always use 4 bytes, so the configured value is not relevant. | piled, internal linkages always use 4 bytes, so the configured value is not relevant. | |||
The default value of 2 for the 8-bit and 16-bit libraries is sufficient for all but the most | The default value of 2 for the 8-bit and 16-bit libraries is sufficient for all but the most | |||
massive patterns, since it allows the size of the compiled pattern to be up to 65535 code units. | massive patterns, since it allows the size of the compiled pattern to be up to 65535 code units. | |||
Larger values allow larger regular expressions to be compiled by those two libraries, but at the | Larger values allow larger regular expressions to be compiled by those two libraries, but at the | |||
expense of slower matching. | expense of slower matching. | |||
PCRE2_CONFIG_MATCHLIMIT | PCRE2_CONFIG_MATCHLIMIT | |||
The output is a uint32_t integer that gives the default match limit for pcre2_match(). Further | The output is a uint32_t integer that gives the default match limit for pcre2_match(). Further | |||
details are given with pcre2_set_match_limit() above. | details are given with pcre2_set_match_limit() above. | |||
PCRE2_CONFIG_NEWLINE | PCRE2_CONFIG_NEWLINE | |||
The output is a uint32_t integer whose value specifies the default character sequence that is | The output is a uint32_t integer whose value specifies the default character sequence that is | |||
recognized as meaning "newline". The values are: | recognized as meaning "newline". The values are: | |||
PCRE2_NEWLINE_CR Carriage return (CR) | PCRE2_NEWLINE_CR Carriage return (CR) | |||
PCRE2_NEWLINE_LF Linefeed (LF) | PCRE2_NEWLINE_LF Linefeed (LF) | |||
PCRE2_NEWLINE_CRLF Carriage return, linefeed (CRLF) | PCRE2_NEWLINE_CRLF Carriage return, linefeed (CRLF) | |||
PCRE2_NEWLINE_ANY Any Unicode line ending | PCRE2_NEWLINE_ANY Any Unicode line ending | |||
PCRE2_NEWLINE_ANYCRLF Any of CR, LF, or CRLF | PCRE2_NEWLINE_ANYCRLF Any of CR, LF, or CRLF | |||
PCRE2_NEWLINE_NUL The NUL character (binary zero) | PCRE2_NEWLINE_NUL The NUL character (binary zero) | |||
The default should normally correspond to the standard sequence for your operating system. | The default should normally correspond to the standard sequence for your operating system. | |||
PCRE2_CONFIG_NEVER_BACKSLASH_C | PCRE2_CONFIG_NEVER_BACKSLASH_C | |||
The output is a uint32_t integer that is set to one if the use of \C was permanently disabled | The output is a uint32_t integer that is set to one if the use of \C was permanently disabled | |||
when PCRE2 was built; otherwise it is set to zero. | when PCRE2 was built; otherwise it is set to zero. | |||
PCRE2_CONFIG_PARENSLIMIT | PCRE2_CONFIG_PARENSLIMIT | |||
The output is a uint32_t integer that gives the maximum depth of nes | The output is a uint32_t integer that gives the maximum depth of ne | |||
ting of parentheses (of any | sting of parentheses (of any | |||
kind) in a pattern. This limit is imposed to cap the amount of syst | kind) in a pattern. This limit is imposed to cap the amount of syste | |||
em stack used when a pattern | m stack used when a pattern | |||
is compiled. It is specified when PCRE2 is built; the default is 250 | is compiled. It is specified when PCRE2 is built; the default is 2 | |||
. This limit does not take | 50. This limit does not take | |||
into account the stack that may already be used by the calling ap | into account the stack that may already be used by the calling appli | |||
plication. For finer control | cation. For finer control | |||
over compilation stack usage, see pcre2_set_compile_recursion_guard(). | over compilation stack usage, see pcre2_set_compile_recursion_guard(). | |||
PCRE2_CONFIG_STACKRECURSE | PCRE2_CONFIG_STACKRECURSE | |||
This parameter is obsolete and should not be used in new code. The output is a uint32_t integer | This parameter is obsolete and should not be used in new code. The output is a uint32_t integer | |||
that is always set to zero. | that is always set to zero. | |||
PCRE2_CONFIG_TABLES_LENGTH | PCRE2_CONFIG_TABLES_LENGTH | |||
The output is a uint32_t integer that gives the length of PCRE2's character processing tables in | The output is a uint32_t integer that gives the length of PCRE2's character processing tables in | |||
bytes. For details of these tables see the section on locale support below. | bytes. For details of these tables see the section on locale support below. | |||
PCRE2_CONFIG_UNICODE_VERSION | PCRE2_CONFIG_UNICODE_VERSION | |||
The where argument should point to a buffer that is at least 24 | The where argument should point to a buffer that is at least 24 cod | |||
code units long. (The exact | e units long. (The exact | |||
length required can be found by calling pcre2_config() with where se | length required can be found by calling pcre2_config() with wher | |||
t to NULL.) If PCRE2 has | e set to NULL.) If PCRE2 has | |||
been compiled without Unicode support, the buffer is filled wit | been compiled without Unicode support, the buffer is filled with th | |||
h the text "Unicode not sup‐ | e text "Unicode not sup‐ | |||
ported". Otherwise, the Unicode version string (for example, "8.0.0") is inserted. The number of | ported". Otherwise, the Unicode version string (for example, "8.0.0") is inserted. The number of | |||
code units used is returned. This is the length of the string plus one unit for the terminating | code units used is returned. This is the length of the string plus one unit for the terminating | |||
zero. | zero. | |||
PCRE2_CONFIG_UNICODE | PCRE2_CONFIG_UNICODE | |||
The output is a uint32_t integer that is set to one if Unicode support is available; otherwise | The output is a uint32_t integer that is set to one if Unicode support is available; otherwise | |||
it is set to zero. Unicode support implies UTF support. | it is set to zero. Unicode support implies UTF support. | |||
PCRE2_CONFIG_VERSION | PCRE2_CONFIG_VERSION | |||
The where argument should point to a buffer that is at least 24 cod | The where argument should point to a buffer that is at least 24 | |||
e units long. (The exact | code units long. (The exact | |||
length required can be found by calling pcre2_config() with where | length required can be found by calling pcre2_config() with where se | |||
set to NULL.) The buffer is | t to NULL.) The buffer is | |||
filled with the PCRE2 version string, zero-terminated. The number of | filled with the PCRE2 version string, zero-terminated. The numb | |||
code units used is re‐ | er of code units used is re‐ | |||
turned. This is the length of the string plus one unit for the terminating zero. | turned. This is the length of the string plus one unit for the terminating zero. | |||
COMPILING A PATTERN | COMPILING A PATTERN | |||
pcre2_code *pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE length, | pcre2_code *pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE length, | |||
uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset, | uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset, | |||
pcre2_compile_context *ccontext); | pcre2_compile_context *ccontext); | |||
void pcre2_code_free(pcre2_code *code); | void pcre2_code_free(pcre2_code *code); | |||
pcre2_code *pcre2_code_copy(const pcre2_code *code); | pcre2_code *pcre2_code_copy(const pcre2_code *code); | |||
pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *code); | pcre2_code *pcre2_code_copy_with_tables(const pcre2_code *code); | |||
The pcre2_compile() function compiles a pattern into an internal form. The pattern is defined | The pcre2_compile() function compiles a pattern into an internal form. The pattern is defined | |||
by a pointer to a string of code units and a length in code units. If the pattern is zero-termi‐ | by a pointer to a string of code units and a length in code units. If the pattern is zero-termi‐ | |||
nated, the length can be specified as PCRE2_ZERO_TERMINATED. A NUL | nated, the length can be specified as PCRE2_ZERO_TERMINATED. A | |||
L pattern pointer with a | NULL pattern pointer with a | |||
length of zero is treated as an empty string (NULL with a non-zer | length of zero is treated as an empty string (NULL with a non-zero l | |||
o length causes an error re‐ | ength causes an error re‐ | |||
turn). The function returns a pointer to a block of memory that cont | turn). The function returns a pointer to a block of memory that c | |||
ains the compiled pattern | ontains the compiled pattern | |||
and related data, or NULL if an error occurred. | and related data, or NULL if an error occurred. | |||
If the compile context argument ccontext is NULL, memory for the compiled pattern is obtained by | If the compile context argument ccontext is NULL, memory for the compiled pattern is obtained by | |||
calling malloc(). Otherwise, it is obtained from the same memory fu | calling malloc(). Otherwise, it is obtained from the same memory fun | |||
nction that was used for the | ction that was used for the | |||
compile context. The caller must free the memory by calling pcre2_c | compile context. The caller must free the memory by calling pcr | |||
ode_free() when it is no | e2_code_free() when it is no | |||
longer needed. If pcre2_code_free() is called with a NULL argum | longer needed. If pcre2_code_free() is called with a NULL argument | |||
ent, it returns immediately, | , it returns immediately, | |||
without doing anything. | without doing anything. | |||
The function pcre2_code_copy() makes a copy of the compiled code in | The function pcre2_code_copy() makes a copy of the compiled code i | |||
new memory, using the same | n new memory, using the same | |||
memory allocator as was used for the original. However, if the co | memory allocator as was used for the original. However, if the code | |||
de has been processed by the | has been processed by the | |||
JIT compiler (see below), the JIT information cannot be copied (beca | JIT compiler (see below), the JIT information cannot be copied (b | |||
use it is position-depen‐ | ecause it is position-depen‐ | |||
dent). The new copy can initially be used only for non-JIT matching, though it can be passed to | dent). The new copy can initially be used only for non-JIT matching, though it can be passed to | |||
pcre2_jit_compile() if required. If pcre2_code_copy() is called with a NULL argument, it returns | pcre2_jit_compile() if required. If pcre2_code_copy() is called with a NULL argument, it returns | |||
NULL. | NULL. | |||
The pcre2_code_copy() function provides a way for individual threads in a multithreaded applica‐ | The pcre2_code_copy() function provides a way for individual threads in a multithreaded applica‐ | |||
tion to acquire a private copy of shared compiled code. However, it does not make a copy of the | tion to acquire a private copy of shared compiled code. However, it does not make a copy of the | |||
character tables used by the compiled pattern; the new pattern code points to the same tables as | character tables used by the compiled pattern; the new pattern code points to the same tables as | |||
the original code. (See "Locale Support" below for details of thes | the original code. (See "Locale Support" below for details of these | |||
e character tables.) In many | character tables.) In many | |||
applications the same tables are used throughout, so this behaviour | applications the same tables are used throughout, so this behavio | |||
is appropriate. Neverthe‐ | ur is appropriate. Neverthe‐ | |||
less, there are occasions when a copy of a compiled pattern and the | less, there are occasions when a copy of a compiled pattern and the | |||
relevant tables are needed. | relevant tables are needed. | |||
The pcre2_code_copy_with_tables() provides this facility. Copies of | The pcre2_code_copy_with_tables() provides this facility. Copies | |||
both the code and the ta‐ | of both the code and the ta‐ | |||
bles are made, with the new code pointing to the new tables. The | bles are made, with the new code pointing to the new tables. The mem | |||
memory for the new tables is | ory for the new tables is | |||
automatically freed when pcre2_code_free() is called for the new cop | automatically freed when pcre2_code_free() is called for the new c | |||
y of the compiled code. If | opy of the compiled code. If | |||
pcre2_code_copy_with_tables() is called with a NULL argument, it returns NULL. | pcre2_code_copy_with_tables() is called with a NULL argument, it returns NULL. | |||
NOTE: When one of the matching functions is called, pointers to | NOTE: When one of the matching functions is called, pointers to the | |||
the compiled pattern and the | compiled pattern and the | |||
subject string are set in the match data block so that they can be r | subject string are set in the match data block so that they can be | |||
eferenced by the substring | referenced by the substring | |||
extraction functions after a successful match. After running a mat | extraction functions after a successful match. After running a matc | |||
ch, you must not free a com‐ | h, you must not free a com‐ | |||
piled pattern or a subject string until after all operations on the | piled pattern or a subject string until after all operations on the | |||
match data block have taken | match data block have taken | |||
place, unless, in the case of the subject string, you have used th | place, unless, in the case of the subject string, you have used the | |||
e PCRE2_COPY_MATCHED_SUBJECT | PCRE2_COPY_MATCHED_SUBJECT | |||
option, which is described in the section entitled "Option bits for pcre2_match()" below. | option, which is described in the section entitled "Option bits for pcre2_match()" below. | |||
The options argument for pcre2_compile() contains various bit settings that affect the compila‐ | The options argument for pcre2_compile() contains various bit settings that affect the compila‐ | |||
tion. It should be zero if none of them are required. The available options are described below. | tion. It should be zero if none of them are required. The available options are described below. | |||
Some of them (in particular, those that are compatible with Perl, | Some of them (in particular, those that are compatible with Perl, bu | |||
but some others as well) can | t some others as well) can | |||
also be set and unset from within the pattern (see the detailed desc | also be set and unset from within the pattern (see the detailed des | |||
ription in the pcre2pattern | cription in the pcre2pattern | |||
documentation). | documentation). | |||
For those options that can be different in different parts of the | For those options that can be different in different parts of the pa | |||
pattern, the contents of the | ttern, the contents of the | |||
options argument specifies their settings at the start of compil | options argument specifies their settings at the start of comp | |||
ation. The PCRE2_ANCHORED, | ilation. The PCRE2_ANCHORED, | |||
PCRE2_ENDANCHORED, and PCRE2_NO_UTF_CHECK options can be set at the | PCRE2_ENDANCHORED, and PCRE2_NO_UTF_CHECK options can be set at the | |||
time of matching as well as | time of matching as well as | |||
at compile time. | at compile time. | |||
Some additional options and less frequently required compile-time parameters (for example, the | Some additional options and less frequently required compile-time parameters (for example, the | |||
newline setting) can be provided in a compile context (as described above). | newline setting) can be provided in a compile context (as described above). | |||
If errorcode or erroroffset is NULL, pcre2_compile() returns NULL | If errorcode or erroroffset is NULL, pcre2_compile() returns NULL i | |||
immediately. Otherwise, the | mmediately. Otherwise, the | |||
variables to which these point are set to an error code and an offse | variables to which these point are set to an error code and an o | |||
t (number of code units) | ffset (number of code units) | |||
within the pattern, respectively, when pcre2_compile() returns NULL | within the pattern, respectively, when pcre2_compile() returns NULL | |||
because a compilation error | because a compilation error | |||
has occurred. | has occurred. | |||
There are nearly 100 positive error codes that pcre2_compile() may r | There are over 100 positive error codes that pcre2_compile() may re | |||
eturn if it finds an error | turn if it finds an error in | |||
in the pattern. There are also some negative error codes that are | the pattern. There are also some negative error codes that are used | |||
used for invalid UTF strings | for invalid UTF strings when | |||
when validity checking is in force. These are the same as g | validity checking is in force. These are the same as give | |||
iven by pcre2_match() and | n by pcre2_match() and | |||
pcre2_dfa_match(), and are described in the pcre2unicode documentation. There is no separate | pcre2_dfa_match(), and are described in the pcre2unicode documentation. There is no separate | |||
documentation for the positive error codes, because the textual error messages that are obtained | documentation for the positive error codes, because the textual error messages that are obtained | |||
by calling the pcre2_get_error_message() function (see "Obtaining a textual error message" be‐ | by calling the pcre2_get_error_message() function (see "Obtaining a textual error message" be‐ | |||
low) should be self-explanatory. Macro names starting with PCRE2_ERROR_ are defined for both | low) should be self-explanatory. Macro names starting with PCRE2_ERROR_ are defined for both | |||
positive and negative error codes in pcre2.h. When compilation is successful errorcode is set to | positive and negative error codes in pcre2.h. When compilation is successful errorcode is set to | |||
a value that returns the message "no error" if passed to pcre2_get_error_message(). | a value that returns the message "no error" if passed to pcre2_get_error_message(). | |||
The value returned in erroroffset is an indication of where in the pattern an error occurred. | The value returned in erroroffset is an indication of where in the pattern an error occurred. | |||
When there is no error, zero is returned. A non-zero value is not necessarily the furthest point | When there is no error, zero is returned. A non-zero value is not necessarily the furthest point | |||
in the pattern that was read. For example, after the error "lookbehind assertion is not fixed | in the pattern that was read. For example, after the error "lookbehind assertion is not fixed | |||
skipping to change at line 1132 | skipping to change at line 1242 | |||
placement strings passed to pcre2_substitute(). | placement strings passed to pcre2_substitute(). | |||
PCRE2_ALT_CIRCUMFLEX | PCRE2_ALT_CIRCUMFLEX | |||
In multiline mode (when PCRE2_MULTILINE is set), the circumflex metacharacter matches at the | In multiline mode (when PCRE2_MULTILINE is set), the circumflex metacharacter matches at the | |||
start of the subject (unless PCRE2_NOTBOL is set), and also after any internal newline. However, | start of the subject (unless PCRE2_NOTBOL is set), and also after any internal newline. However, | |||
it does not match after a newline at the end of the subject, for compatibility with Perl. If you | it does not match after a newline at the end of the subject, for compatibility with Perl. If you | |||
want a multiline circumflex also to match after a terminating newline, you must set | want a multiline circumflex also to match after a terminating newline, you must set | |||
PCRE2_ALT_CIRCUMFLEX. | PCRE2_ALT_CIRCUMFLEX. | |||
PCRE2_ALT_EXTENDED_CLASS | ||||
Alters the parsing of character classes to follow the extended syntax described by Unicode | ||||
UTS#18. The PCRE2_ALT_EXTENDED_CLASS option has no impact on the behaviour of the Perl-specific | ||||
"(?[...])" syntax for extended classes, but instead enables the alternative syntax of extended | ||||
class behaviour inside ordinary "[...]" character classes. See the pcre2pattern documentation | ||||
for details of the character classes supported. | ||||
PCRE2_ALT_VERBNAMES | PCRE2_ALT_VERBNAMES | |||
By default, for compatibility with Perl, the name in any verb sequen ce such as (*MARK:NAME) is | By default, for compatibility with Perl, the name in any verb sequen ce such as (*MARK:NAME) is | |||
any sequence of characters that does not include a closing parenthesis. The name is not | any sequence of characters that does not include a closing parenthesis. The name is not | |||
processed in any way, and it is not possible to include a closing pa renthesis in the name. How‐ | processed in any way, and it is not possible to include a closing pa renthesis in the name. How‐ | |||
ever, if the PCRE2_ALT_VERBNAMES option is set, normal backslash p rocessing is applied to verb | ever, if the PCRE2_ALT_VERBNAMES option is set, normal backslash p rocessing is applied to verb | |||
names and only an unescaped closing parenthesis terminates the name. A closing parenthesis can | names and only an unescaped closing parenthesis terminates the name. A closing parenthesis can | |||
be included in a name either as \) or between \Q and \E. If the PCRE2_EXTENDED or PCRE2_EX‐ | be included in a name either as \) or between \Q and \E. If the PCRE2_EXTENDED or PCRE2_EX‐ | |||
TENDED_MORE option is set with PCRE2_ALT_VERBNAMES, unescaped whi tespace in verb names is | TENDED_MORE option is set with PCRE2_ALT_VERBNAMES, unescaped whi tespace in verb names is | |||
skipped and #-comments are recognized, exactly as in the rest of the pattern. | skipped and #-comments are recognized, exactly as in the rest of the pattern. | |||
skipping to change at line 1155 | skipping to change at line 1273 | |||
If this bit is set, pcre2_compile() automatically inserts callout items, all with number 255, | If this bit is set, pcre2_compile() automatically inserts callout items, all with number 255, | |||
before each pattern item, except immediately before or after an expl icit callout in the pattern. | before each pattern item, except immediately before or after an expl icit callout in the pattern. | |||
For discussion of the callout facility, see the pcre2callout documen tation. | For discussion of the callout facility, see the pcre2callout documen tation. | |||
PCRE2_CASELESS | PCRE2_CASELESS | |||
If this bit is set, letters in the pattern match both upper and lowe r case letters in the sub‐ | If this bit is set, letters in the pattern match both upper and lowe r case letters in the sub‐ | |||
ject. It is equivalent to Perl's /i option, and it can be changed wi thin a pattern by a (?i) op‐ | ject. It is equivalent to Perl's /i option, and it can be changed wi thin a pattern by a (?i) op‐ | |||
tion setting. If either PCRE2_UTF or PCRE2_UCP is set, Unicode prope rties are used for all char‐ | tion setting. If either PCRE2_UTF or PCRE2_UCP is set, Unicode prope rties are used for all char‐ | |||
acters with more than one other case, and for all characters whose code points are greater than | acters with more than one other case, and for all characters whose code points are greater than | |||
U+007F. Note that there are two ASCII characters, K and S, that, in | U+007F. | |||
addition to their lower case | ||||
ASCII equivalents, are case-equivalent with U+212A (Kelvin sign) and | Note that there are two ASCII characters, K and S, that, in addition | |||
U+017F (long S) respec‐ | to their lower case ASCII | |||
tively. If you do not want this case equivalence, you can supp | equivalents, are case-equivalent with U+212A (Kelvin sign) and U+01 | |||
ress it by setting PCRE2_EX‐ | 7F (long S) respectively. If | |||
TRA_CASELESS_RESTRICT. | you do not want this case equivalence, you can suppress it by setti | |||
ng PCRE2_EXTRA_CASELESS_RE‐ | ||||
For lower valued characters with only one other case, a lookup table | STRICT. | |||
is used for speed. When | ||||
neither PCRE2_UTF nor PCRE2_UCP is set, a lookup table is used f | One language family, Turkish and Azeri, has its own case-insensiti | |||
or all code points less than | vity rules, which can be se‐ | |||
256, and higher code points (available only in 16-bit or 32-bit mode | lected by setting PCRE2_EXTRA_TURKISH_CASING. This alters the behavi | |||
) are treated as not having | our of the 'i', 'I', U+0130 | |||
(capital I with dot above), and U+0131 (small dotless i) characters. | ||||
For lower valued characters with only one other case, a lookup t | ||||
able is used for speed. When | ||||
neither PCRE2_UTF nor PCRE2_UCP is set, a lookup table is used for a | ||||
ll code points less than | ||||
256, and higher code points (available only in 16-bit or 32-bit mod | ||||
e) are treated as not having | ||||
another case. | another case. | |||
From release 10.45 PCRE2_CASELESS also affects what some of the letter-related Unicode property | ||||
escapes (\p and \P) match. The properties Lu (upper case letter), Ll (lower case letter), and Lt | ||||
(title case letter) are all treated as LC (cased letter) when PCRE2_CASELESS is set. | ||||
PCRE2_DOLLAR_ENDONLY | PCRE2_DOLLAR_ENDONLY | |||
If this bit is set, a dollar metacharacter in the pattern matches on ly at the end of the subject | If this bit is set, a dollar metacharacter in the pattern matches on ly at the end of the subject | |||
string. Without this option, a dollar also matches immediately be fore a newline at the end of | string. Without this option, a dollar also matches immediately be fore a newline at the end of | |||
the string (but not before any other newlines). The PCRE2_DOLLAR_END ONLY option is ignored if | the string (but not before any other newlines). The PCRE2_DOLLAR_END ONLY option is ignored if | |||
PCRE2_MULTILINE is set. There is no equivalent to this option i n Perl, and no way to set it | PCRE2_MULTILINE is set. There is no equivalent to this option i n Perl, and no way to set it | |||
within a pattern. | within a pattern. | |||
PCRE2_DOTALL | PCRE2_DOTALL | |||
skipping to change at line 1325 | skipping to change at line 1453 | |||
ing point in the middle of a multi-code-unit character. This opti on may be useful in applica‐ | ing point in the middle of a multi-code-unit character. This opti on may be useful in applica‐ | |||
tions that process patterns from external sources. Note that there i s also a build-time option | tions that process patterns from external sources. Note that there i s also a build-time option | |||
that permanently locks out the use of \C. | that permanently locks out the use of \C. | |||
PCRE2_NEVER_UCP | PCRE2_NEVER_UCP | |||
This option locks out the use of Unicode properties for handling \B, \b, \D, \d, \S, \s, \W, \w, | This option locks out the use of Unicode properties for handling \B, \b, \D, \d, \S, \s, \W, \w, | |||
and some of the POSIX character classes, as described for the PCRE2_ UCP option below. In partic‐ | and some of the POSIX character classes, as described for the PCRE2_ UCP option below. In partic‐ | |||
ular, it prevents the creator of the pattern from enabling this faci lity by starting the pattern | ular, it prevents the creator of the pattern from enabling this faci lity by starting the pattern | |||
with (*UCP). This option may be useful in applications that pr ocess patterns from external | with (*UCP). This option may be useful in applications that pr ocess patterns from external | |||
sources. The option combination PCRE_UCP and PCRE_NEVER_UCP causes a n error. | sources. The option combination PCRE2_UCP and PCRE2_NEVER_UCP causes an error. | |||
PCRE2_NEVER_UTF | PCRE2_NEVER_UTF | |||
This option locks out interpretation of the pattern as UTF-8, UTF-16 , or UTF-32, depending on | This option locks out interpretation of the pattern as UTF-8, UTF-16 , or UTF-32, depending on | |||
which library is in use. In particular, it prevents the creator of t he pattern from switching to | which library is in use. In particular, it prevents the creator of t he pattern from switching to | |||
UTF interpretation by starting the pattern with (*UTF). This opti on may be useful in applica‐ | UTF interpretation by starting the pattern with (*UTF). This opti on may be useful in applica‐ | |||
tions that process patterns from external sources. The combi nation of PCRE2_UTF and | tions that process patterns from external sources. The combi nation of PCRE2_UTF and | |||
PCRE2_NEVER_UTF causes an error. | PCRE2_NEVER_UTF causes an error. | |||
PCRE2_NO_AUTO_CAPTURE | PCRE2_NO_AUTO_CAPTURE | |||
If this option is set, it disables the use of numbered capturing par entheses in the pattern. Any | If this option is set, it disables the use of numbered capturing par entheses in the pattern. Any | |||
opening parenthesis that is not followed by ? behaves as if it w ere followed by ?: but named | opening parenthesis that is not followed by ? behaves as if it w ere followed by ?: but named | |||
parentheses can still be used for capturing (and they acquire number s in the usual way). This is | parentheses can still be used for capturing (and they acquire number s in the usual way). This is | |||
the same as Perl's /n option. Note that, when this option is set, r eferences to capture groups | the same as Perl's /n option. Note that, when this option is set, r eferences to capture groups | |||
(backreferences or recursion/subroutine calls) may only refer to nam ed groups, though the refer‐ | (backreferences or recursion/subroutine calls) may only refer to nam ed groups, though the refer‐ | |||
ence can be by name or by number. | ence can be by name or by number. | |||
PCRE2_NO_AUTO_POSSESS | PCRE2_NO_AUTO_POSSESS | |||
If this option is set, it disables "auto-possessification", which | If this (deprecated) option is set, it disables "auto-possessifica | |||
is an optimization that, for | tion", which is an optimiza‐ | |||
example, turns a+b into a++b in order to avoid backtracks into a+ th | tion that, for example, turns a+b into a++b in order to avoid backtr | |||
at can never be successful. | acks into a+ that can never | |||
However, if callouts are in use, auto-possessification means that so | be successful. However, if callouts are in use, auto-possessificat | |||
me callouts are never taken. | ion means that some callouts | |||
You can set this option if you want the matching functions to do a | are never taken. You can set this option if you want the matching fu | |||
full unoptimized search and | nctions to do a full unopti‐ | |||
run all the callouts, but it is mainly provided for testing purposes | mized search and run all the callouts, but it is mainly provided for | |||
. | testing purposes. | |||
If a compile context is available, it is recommended to use pcre2_set_optimize() with the direc‐ | ||||
tive PCRE2_AUTO_POSSESS_OFF rather than the compile option PCRE2_NO_AUTO_POSSESS. Note that | ||||
PCRE2_NO_AUTO_POSSESS takes precedence over the pcre2_set_optimize() optimization directives | ||||
PCRE2_AUTO_POSSESS and PCRE2_AUTO_POSSESS_OFF. | ||||
PCRE2_NO_DOTSTAR_ANCHOR | PCRE2_NO_DOTSTAR_ANCHOR | |||
If this option is set, it disables an optimization that is applied w | If this (deprecated) option is set, it disables an optimization that | |||
hen .* is the first signifi‐ | is applied when .* is the | |||
cant item in a top-level branch of a pattern, and all the other bran | first significant item in a top-level branch of a pattern, and all t | |||
ches also start with .* or | he other branches also start | |||
with \A or \G or ^. The optimization is automatically disabled for | with .* or with \A or \G or ^. The optimization is automatically dis | |||
.* if it is inside an atomic | abled for .* if it is inside | |||
group or a capture group that is the subject of a backreference, or | an atomic group or a capture group that is the subject of a back | |||
if the pattern contains | reference, or if the pattern | |||
(*PRUNE) or (*SKIP). When the optimization is not disabled, such a | contains (*PRUNE) or (*SKIP). When the optimization is not disabled, | |||
pattern is automatically an‐ | such a pattern is automati‐ | |||
chored if PCRE2_DOTALL is set for all the .* items and PCRE2_MULTILI | cally anchored if PCRE2_DOTALL is set for all the .* items and PCRE2 | |||
NE is not set for any ^ | _MULTILINE is not set for | |||
items. Otherwise, the fact that any match must start either at the | any ^ items. Otherwise, the fact that any match must start either at | |||
start of the subject or fol‐ | the start of the subject or | |||
lowing a newline is remembered. Like other optimizations, this can c | following a newline is remembered. Like other optimizations, t | |||
ause callouts to be skipped. | his can cause callouts to be | |||
skipped. (If a compile context is available, it is recommended to u | ||||
se pcre2_set_optimize() with | ||||
the directive PCRE2_DOTSTAR_ANCHOR_OFF instead.) | ||||
PCRE2_NO_START_OPTIMIZE | PCRE2_NO_START_OPTIMIZE | |||
This is an option whose main effect is at matching time. It does not change what pcre2_compile() | This is an option whose main effect is at matching time. It does not change what pcre2_compile() | |||
generates, but it does affect the output of the JIT compiler. | generates, but it does affect the output of the JIT compiler. Settin | |||
g this option is equivalent | ||||
to calling pcre2_set_optimize() with the directive parameter set to | ||||
PCRE2_START_OPTIMIZE_OFF. | ||||
There are a number of optimizations that may occur at the start of a match, in order to speed up | There are a number of optimizations that may occur at the start of a match, in order to speed up | |||
the process. For example, if it is known that an unanchored match mu st start with a specific | the process. For example, if it is known that an unanchored matc h must start with a specific | |||
code unit value, the matching code searches the subject for that val ue, and fails immediately if | code unit value, the matching code searches the subject for that val ue, and fails immediately if | |||
it cannot find it, without actually running the main matching func | it cannot find it, without actually running the main matching functi | |||
tion. This means that a spe‐ | on. The start-up optimiza‐ | |||
cial item such as (*COMMIT) at the start of a pattern is not conside | tions are in effect a pre-scan of the subject that takes place befor | |||
red until after a suitable | e the pattern is run. | |||
starting point for the match has been found. Also, when callouts | ||||
or (*MARK) items are in use, | ||||
these "start-up" optimizations can cause them to be skipped if the | ||||
pattern is never actually | ||||
used. The start-up optimizations are in effect a pre-scan of the sub | ||||
ject that takes place before | ||||
the pattern is run. | ||||
The PCRE2_NO_START_OPTIMIZE option disables the start-up optimizatio | ||||
ns, possibly causing perfor‐ | ||||
mance to suffer, but ensuring that in cases where the result is "no | ||||
match", the callouts do oc‐ | ||||
cur, and that items such as (*COMMIT) and (*MARK) are considered at | ||||
every possible starting po‐ | ||||
sition in the subject string. | ||||
Setting PCRE2_NO_START_OPTIMIZE may change the outcome of a match | ||||
ing operation. Consider the | ||||
pattern | ||||
(*COMMIT)ABC | ||||
When this is compiled, PCRE2 records the fact that a match must star | ||||
t with the character "A". | ||||
Suppose the subject string is "DEFABC". The start-up optimization sc | ||||
ans along the subject, finds | ||||
"A" and runs the first match attempt from there. The (*COMMIT) item | ||||
means that the pattern must | ||||
match the current starting position, which in this case, it does. Ho | ||||
wever, if the same match is | ||||
run with PCRE2_NO_START_OPTIMIZE set, the initial scan along the sub | ||||
ject string does not happen. | ||||
The first match attempt is run starting from "D" and when this f | ||||
ails, (*COMMIT) prevents any | ||||
further matches being tried, so the overall result is "no match". | ||||
As another start-up optimization makes use of a minimum length for a | Disabling the start-up optimizations may cause performance to suff | |||
matching subject, which is | er. However, this may be de‐ | |||
recorded when possible. Consider the pattern | sirable for patterns which contain callouts or items such as (*COMM | |||
IT) and (*MARK). See the | ||||
(*MARK:1)B(*MARK:2)(X|Y) | above description of PCRE2_START_OPTIMIZE_OFF for further details. | |||
The minimum length for a match is two characters. If the subject is | ||||
"XXBB", the "starting char‐ | ||||
acter" optimization skips "XX", then tries to match "BB", which is l | ||||
ong enough. In the process, | ||||
(*MARK:2) is encountered and remembered. When the match attempt f | ||||
ails, the next "B" is found, | ||||
but there is only one character left, so there are no more attempts, | ||||
and "no match" is returned | ||||
with the "last mark seen" set to "2". If NO_START_OPTIMIZE is set, h | ||||
owever, matches are tried at | ||||
every possible starting position, including at the end of the sub | ||||
ject, where (*MARK:1) is en‐ | ||||
countered, but there is no "B", so the "last mark seen" that is retu | ||||
rned is "1". In this case, | ||||
the optimizations do not affect the overall match result, which is s | ||||
till "no match", but they do | ||||
affect the auxiliary information that is returned. | ||||
PCRE2_NO_UTF_CHECK | PCRE2_NO_UTF_CHECK | |||
When PCRE2_UTF is set, the validity of the pattern as a UTF stri ng is automatically checked. | When PCRE2_UTF is set, the validity of the pattern as a UTF stri ng is automatically checked. | |||
There are discussions about the validity of UTF-8 strings, UTF-16 st rings, and UTF-32 strings in | There are discussions about the validity of UTF-8 strings, UTF-16 st rings, and UTF-32 strings in | |||
the pcre2unicode document. If an invalid UTF sequence is found, pcre 2_compile() returns a nega‐ | the pcre2unicode document. If an invalid UTF sequence is found, pcre 2_compile() returns a nega‐ | |||
tive error code. | tive error code. | |||
If you know that your pattern is a valid UTF string, and you want to skip this check for perfor‐ | If you know that your pattern is a valid UTF string, and you want to skip this check for perfor‐ | |||
mance reasons, you can set the PCRE2_NO_UTF_CHECK option. When it i s set, the effect of passing | mance reasons, you can set the PCRE2_NO_UTF_CHECK option. When it i s set, the effect of passing | |||
skipping to change at line 1444 | skipping to change at line 1547 | |||
This option has two effects. Firstly, it changes the way PCRE2 processes \B, \b, \D, \d, \S, \s, | This option has two effects. Firstly, it changes the way PCRE2 processes \B, \b, \D, \d, \S, \s, | |||
\W, \w, and some of the POSIX character classes. By default, only AS CII characters are recog‐ | \W, \w, and some of the POSIX character classes. By default, only AS CII characters are recog‐ | |||
nized, but if PCRE2_UCP is set, Unicode properties are used to cl assify characters. There are | nized, but if PCRE2_UCP is set, Unicode properties are used to cl assify characters. There are | |||
some PCRE2_EXTRA options (see below) that add finer control to this behaviour. More details are | some PCRE2_EXTRA options (see below) that add finer control to this behaviour. More details are | |||
given in the section on generic character types in the pcre2pattern page. | given in the section on generic character types in the pcre2pattern page. | |||
The second effect of PCRE2_UCP is to force the use of Unicode prope rties for upper/lower casing | The second effect of PCRE2_UCP is to force the use of Unicode prope rties for upper/lower casing | |||
operations, even when PCRE2_UTF is not set. This makes it possible t o process strings in the | operations, even when PCRE2_UTF is not set. This makes it possible t o process strings in the | |||
16-bit UCS-2 code. This option is available only if PCRE2 has been c ompiled with Unicode support | 16-bit UCS-2 code. This option is available only if PCRE2 has been c ompiled with Unicode support | |||
(which is the default). The PCRE2_EXTRA_CASELESS_RESTRICT option (s | (which is the default). | |||
ee below) restricts caseless | ||||
matching such that ASCII characters match only ASCII characters an | The PCRE2_EXTRA_CASELESS_RESTRICT option (see above) restricts casel | |||
d non-ASCII characters match | ess matching such that ASCII | |||
only non-ASCII characters. | characters match only ASCII characters and non-ASCII characters matc | |||
h only non-ASCII characters. | ||||
The PCRE2_EXTRA_TURKISH_CASING option (see above) alters the matching of the 'i' characters to | ||||
follow their behaviour in Turkish and Azeri languages. For further details on PCRE2_EXTRA_CASE‐ | ||||
LESS_RESTRICT and PCRE2_EXTRA_TURKISH_CASING, see the pcre2unicode page. | ||||
PCRE2_UNGREEDY | PCRE2_UNGREEDY | |||
This option inverts the "greediness" of the quantifiers so that they | This option inverts the "greediness" of the quantifiers so that the | |||
are not greedy by default, | y are not greedy by default, | |||
but become greedy if followed by "?". It is not compatible with P | but become greedy if followed by "?". It is not compatible with Perl | |||
erl. It can also be set by a | . It can also be set by a | |||
(?U) option setting within the pattern. | (?U) option setting within the pattern. | |||
PCRE2_USE_OFFSET_LIMIT | PCRE2_USE_OFFSET_LIMIT | |||
This option must be set for pcre2_compile() if pcre2_set_offset_limi t() is going to be used to | This option must be set for pcre2_compile() if pcre2_set_offset_li mit() is going to be used to | |||
set a non-default offset limit in a match context for matches that u se this pattern. An error is | set a non-default offset limit in a match context for matches that u se this pattern. An error is | |||
generated if an offset limit is set without this option. For more | generated if an offset limit is set without this option. For more de | |||
details, see the description | tails, see the description | |||
of pcre2_set_offset_limit() in the section that describes matc | of pcre2_set_offset_limit() in the section that describes mat | |||
h contexts. See also the | ch contexts. See also the | |||
PCRE2_FIRSTLINE option above. | PCRE2_FIRSTLINE option above. | |||
PCRE2_UTF | PCRE2_UTF | |||
This option causes PCRE2 to regard both the pattern and the sub ject strings that are subse‐ | This option causes PCRE2 to regard both the pattern and the subject strings that are subse‐ | |||
quently processed as strings of UTF characters instead of single-cod e-unit strings. It is avail‐ | quently processed as strings of UTF characters instead of single-cod e-unit strings. It is avail‐ | |||
able when PCRE2 is built to include Unicode support (which is the de fault). If Unicode support | able when PCRE2 is built to include Unicode support (which is the default). If Unicode support | |||
is not available, the use of this option provokes an error. Details of how PCRE2_UTF changes the | is not available, the use of this option provokes an error. Details of how PCRE2_UTF changes the | |||
behaviour of PCRE2 are given in the pcre2unicode page. In particul ar, note that it changes the | behaviour of PCRE2 are given in the pcre2unicode page. In particular , note that it changes the | |||
way PCRE2_CASELESS works. | way PCRE2_CASELESS works. | |||
Extra compile options | Extra compile options | |||
The option bits that can be set in a compile context by calling the pcre2_set_compile_extra_op‐ | The option bits that can be set in a compile context by calling the pcre2_set_compile_extra_op‐ | |||
tions() function are as follows: | tions() function are as follows: | |||
PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK | PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK | |||
Since release 10.38 PCRE2 has forbidden the use of \K within look | Since release 10.38 PCRE2 has forbidden the use of \K within lookar | |||
around assertions, following | ound assertions, following | |||
Perl's lead. This option is provided to re-enable the previous b | Perl's lead. This option is provided to re-enable the previou | |||
ehaviour (act in positive | s behaviour (act in positive | |||
lookarounds, ignore in negative ones) in case anybody is relying on it. | lookarounds, ignore in negative ones) in case anybody is relying on it. | |||
PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES | PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES | |||
This option applies when compiling a pattern in UTF-8 or UTF-32 mod | This option applies when compiling a pattern in UTF-8 or UTF-32 mode | |||
e. It is forbidden in UTF-16 | . It is forbidden in UTF-16 | |||
mode, and ignored in non-UTF modes. Unicode "surrogate" code points | mode, and ignored in non-UTF modes. Unicode "surrogate" code p | |||
in the range 0xd800 to | oints in the range 0xd800 to | |||
0xdfff are used in pairs in UTF-16 to encode code points with va | 0xdfff are used in pairs in UTF-16 to encode code points with values | |||
lues in the range 0x10000 to | in the range 0x10000 to | |||
0x10ffff. The surrogates cannot therefore be represented in UTF-16. | 0x10ffff. The surrogates cannot therefore be represented in UTF-16 | |||
They can be represented in | . They can be represented in | |||
UTF-8 and UTF-32, but are defined as invalid code points, and caus | UTF-8 and UTF-32, but are defined as invalid code points, and cause | |||
e errors if encountered in a | errors if encountered in a | |||
UTF-8 or UTF-32 string that is being checked for validity by PCRE2. | UTF-8 or UTF-32 string that is being checked for validity by PCRE2. | |||
These values also cause errors if encountered in escape sequences su ch as \x{d912} within a pat‐ | These values also cause errors if encountered in escape sequences su ch as \x{d912} within a pat‐ | |||
tern. However, it seems that some applications, when using PCRE2 to | tern. However, it seems that some applications, when using PCRE2 t | |||
check for unwanted charac‐ | o check for unwanted charac‐ | |||
ters in UTF-8 strings, explicitly test for the surrogates u | ters in UTF-8 strings, explicitly test for the surrogates us | |||
sing escape sequences. The | ing escape sequences. The | |||
PCRE2_NO_UTF_CHECK option does not disable the error that occurs, be cause it applies only to the | PCRE2_NO_UTF_CHECK option does not disable the error that occurs, be cause it applies only to the | |||
testing of input strings for UTF validity. | testing of input strings for UTF validity. | |||
If the extra option PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is set, surr ogate code point values in | If the extra option PCRE2_EXTRA_ALLOW_SURROGATE_ESCAPES is set, su rrogate code point values in | |||
UTF-8 and UTF-32 patterns no longer provoke errors and are incorpora ted in the compiled pattern. | UTF-8 and UTF-32 patterns no longer provoke errors and are incorpora ted in the compiled pattern. | |||
However, they can only match subject characters if the matc hing function is called with | However, they can only match subject characters if the matching function is called with | |||
PCRE2_NO_UTF_CHECK set. | PCRE2_NO_UTF_CHECK set. | |||
PCRE2_EXTRA_ALT_BSUX | PCRE2_EXTRA_ALT_BSUX | |||
The original option PCRE2_ALT_BSUX causes PCRE2 to process \U, \u, a | The original option PCRE2_ALT_BSUX causes PCRE2 to process \U, \u | |||
nd \x in the way that EC‐ | , and \x in the way that EC‐ | |||
MAscript (aka JavaScript) does. Additional functionality was defi | MAscript (aka JavaScript) does. Additional functionality was defined | |||
ned by ECMAscript 6; setting | by ECMAscript 6; setting | |||
PCRE2_EXTRA_ALT_BSUX has the effect of PCRE2_ALT_BSUX, but in additi | PCRE2_EXTRA_ALT_BSUX has the effect of PCRE2_ALT_BSUX, but in addi | |||
on it recognizes \u{hhh..} | tion it recognizes \u{hhh..} | |||
as a hexadecimal character code, where hhh.. is any number of hexade cimal digits. | as a hexadecimal character code, where hhh.. is any number of hexade cimal digits. | |||
PCRE2_EXTRA_ASCII_BSD | PCRE2_EXTRA_ASCII_BSD | |||
This option forces \d to match only ASCII digits, even when PCRE2_UC P is set. It can be changed | This option forces \d to match only ASCII digits, even when PCRE2_UC P is set. It can be changed | |||
within a pattern by means of the (?aD) option setting. | within a pattern by means of the (?aD) option setting. | |||
PCRE2_EXTRA_ASCII_BSS | PCRE2_EXTRA_ASCII_BSS | |||
This option forces \s to match only ASCII space characters, even w hen PCRE2_UCP is set. It can | This option forces \s to match only ASCII space characters, even whe n PCRE2_UCP is set. It can | |||
be changed within a pattern by means of the (?aS) option setting. | be changed within a pattern by means of the (?aS) option setting. | |||
PCRE2_EXTRA_ASCII_BSW | PCRE2_EXTRA_ASCII_BSW | |||
This option forces \w to match only ASCII word characters, even when PCRE2_UCP is set. It can be | This option forces \w to match only ASCII word characters, even when PCRE2_UCP is set. It can be | |||
changed within a pattern by means of the (?aW) option setting. | changed within a pattern by means of the (?aW) option setting. | |||
PCRE2_EXTRA_ASCII_DIGIT | PCRE2_EXTRA_ASCII_DIGIT | |||
This option forces the POSIX character classes [:digit:] and [:xdigi t:] to match only ASCII dig‐ | This option forces the POSIX character classes [:digit:] and [:xdigi t:] to match only ASCII dig‐ | |||
its, even when PCRE2_UCP is set. It can be changed within a pattern by means of the (?aT) option | its, even when PCRE2_UCP is set. It can be changed within a pattern by means of the (?aT) option | |||
setting. | setting. | |||
PCRE2_EXTRA_ASCII_POSIX | PCRE2_EXTRA_ASCII_POSIX | |||
This option forces all the POSIX character classes, including [:digi t:] and [:xdigit:], to match | This option forces all the POSIX character classes, including [:digi t:] and [:xdigit:], to match | |||
only ASCII characters, even when PCRE2_UCP is set. It can be changed | only ASCII characters, even when PCRE2_UCP is set. It can be chang | |||
within a pattern by means | ed within a pattern by means | |||
of the (?aP) option setting, but note that this also sets PCRE2_E | of the (?aP) option setting, but note that this also sets PCRE2_EXTR | |||
XTRA_ASCII_DIGIT in order to | A_ASCII_DIGIT in order to | |||
ensure that (?-aP) unsets all ASCII restrictions for POSIX classes. | ensure that (?-aP) unsets all ASCII restrictions for POSIX classes. | |||
PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL | PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL | |||
This is a dangerous option. Use with care. By default, an unrecogniz | This is a dangerous option. Use with care. By default, an unrecog | |||
ed escape such as \j or a | nized escape such as \j or a | |||
malformed one such as \x{2z} causes a compile-time error when detec | malformed one such as \x{2z} causes a compile-time error when detect | |||
ted by pcre2_compile(). Perl | ed by pcre2_compile(). Perl | |||
is somewhat inconsistent in handling such items: for example, \j is | is somewhat inconsistent in handling such items: for example, \j | |||
treated as a literal "j", | is treated as a literal "j", | |||
and non-hexadecimal digits in \x{} are just ignored, though warning | and non-hexadecimal digits in \x{} are just ignored, though warnings | |||
s are given in both cases if | are given in both cases if | |||
Perl's warning switch is enabled. However, a malformed octal number | Perl's warning switch is enabled. However, a malformed octal numbe | |||
after \o{ always causes an | r after \o{ always causes an | |||
error in Perl. | error in Perl. | |||
If the PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL extra option is passed to p cre2_compile(), all unrecog‐ | If the PCRE2_EXTRA_BAD_ESCAPE_IS_LITERAL extra option is passed to p cre2_compile(), all unrecog‐ | |||
nized or malformed escape sequences are treated as single-character | nized or malformed escape sequences are treated as single-character | |||
escapes. For example, \j is | escapes. For example, \j is | |||
a literal "j" and \x{2z} is treated as the literal string "x{2z}". | a literal "j" and \x{2z} is treated as the literal string "x{2z} | |||
Setting this option means | ". Setting this option means | |||
that typos in patterns may go undetected and have unexpected result | that typos in patterns may go undetected and have unexpected results | |||
s. Also note that a sequence | . Also note that a sequence | |||
such as [\N{] is interpreted as a malformed attempt at [\N{...}] an | such as [\N{] is interpreted as a malformed attempt at [\N{... | |||
d so is treated as [N{] | }] and so is treated as [N{] | |||
whereas [\N] gives an error because an unqualified \N is a valid esc ape sequence but is not sup‐ | whereas [\N] gives an error because an unqualified \N is a valid esc ape sequence but is not sup‐ | |||
ported in a character class. To reiterate: this is a dangerous optio n. Use with great care. | ported in a character class. To reiterate: this is a dangerous optio n. Use with great care. | |||
PCRE2_EXTRA_CASELESS_RESTRICT | PCRE2_EXTRA_CASELESS_RESTRICT | |||
When either PCRE2_UCP or PCRE2_UTF is set, caseless matching follow s Unicode rules, which allow | When either PCRE2_UCP or PCRE2_UTF is set, caseless matching follows Unicode rules, which allow | |||
for more than two cases per character. There are two case-equivalent character sets that contain | for more than two cases per character. There are two case-equivalent character sets that contain | |||
both ASCII and non-ASCII characters. The ASCII letter S is case-equi valent to U+017f (long S) | both ASCII and non-ASCII characters. The ASCII letter S is case-e quivalent to U+017f (long S) | |||
and the ASCII letter K is case-equivalent to U+212a (Kelvin sign). T his option disables recogni‐ | and the ASCII letter K is case-equivalent to U+212a (Kelvin sign). T his option disables recogni‐ | |||
tion of case-equivalences that cross the ASCII/non-ASCII boundar | tion of case-equivalences that cross the ASCII/non-ASCII boundary. I | |||
y. In a caseless match, both | n a caseless match, both | |||
characters must either be ASCII or non-ASCII. The option can be chan | characters must either be ASCII or non-ASCII. The option can be cha | |||
ged with a pattern by the | nged within a pattern by the | |||
(?r) option setting. | (*CASELESS_RESTRICT) or (?r) option settings. | |||
PCRE2_EXTRA_ESCAPED_CR_IS_LF | PCRE2_EXTRA_ESCAPED_CR_IS_LF | |||
There are some legacy applications where the escape sequence \r | There are some legacy applications where the escape sequence \r in | |||
in a pattern is expected to | a pattern is expected to | |||
match a newline. If this option is set, \r in a pattern is converted | match a newline. If this option is set, \r in a pattern is converte | |||
to \n so that it matches a | d to \n so that it matches a | |||
LF (linefeed) instead of a CR (carriage return) character. The opti | LF (linefeed) instead of a CR (carriage return) character. The optio | |||
on does not affect a literal | n does not affect a literal | |||
CR in the pattern, nor does it affect CR specified as an explicit co de point such as \x{0D}. | CR in the pattern, nor does it affect CR specified as an explicit co de point such as \x{0D}. | |||
PCRE2_EXTRA_MATCH_LINE | PCRE2_EXTRA_MATCH_LINE | |||
This option is provided for use by the -x option of pcre2grep. It ca | This option is provided for use by the -x option of pcre2grep. I | |||
uses the pattern only to | t causes the pattern only to | |||
match complete lines. This is achieved by automatically insertin | match complete lines. This is achieved by automatically inserting th | |||
g the code for "^(?:" at the | e code for "^(?:" at the | |||
start of the compiled pattern and ")$" at the end. Thus, when PC | start of the compiled pattern and ")$" at the end. Thus, when | |||
RE2_MULTILINE is set, the | PCRE2_MULTILINE is set, the | |||
matched line may be in the middle of the subject string. This option can be used with PCRE2_LIT‐ | matched line may be in the middle of the subject string. This option can be used with PCRE2_LIT‐ | |||
ERAL. | ERAL. | |||
PCRE2_EXTRA_MATCH_WORD | PCRE2_EXTRA_MATCH_WORD | |||
This option is provided for use by the -w option of pcre2grep. I | This option is provided for use by the -w option of pcre2grep. It ca | |||
t causes the pattern only to | uses the pattern only to | |||
match strings that have a word boundary at the start and the end. Th | match strings that have a word boundary at the start and the end. T | |||
is is achieved by automati‐ | his is achieved by automati‐ | |||
cally inserting the code for "\b(?:" at the start of the compiled p | cally inserting the code for "\b(?:" at the start of the compiled pa | |||
attern and ")\b" at the end. | ttern and ")\b" at the end. | |||
The option may be used with PCRE2_LITERAL. However, it is ignored if | The option may be used with PCRE2_LITERAL. However, it is ignored | |||
PCRE2_EXTRA_MATCH_LINE is | if PCRE2_EXTRA_MATCH_LINE is | |||
also set. | also set. | |||
PCRE2_EXTRA_NO_BS0 | ||||
If this option is set (note that its final character is the digit 0) | ||||
it locks out the use of the | ||||
sequence \0 unless at least one more octal digit follows. | ||||
PCRE2_EXTRA_PYTHON_OCTAL | ||||
If this option is set, PCRE2 follows Python's rules for interpreting | ||||
octal escape sequences. The | ||||
rules for handling sequences such as \14, which could be an octal nu | ||||
mber or a back reference are | ||||
different. Details are given in the pcre2pattern documentation. | ||||
PCRE2_EXTRA_NEVER_CALLOUT | ||||
If this option is set, PCRE2 treats callouts in the pattern as | ||||
a syntax error, returning | ||||
PCRE2_ERROR_CALLOUT_CALLER_DISABLED. This is useful if the applicati | ||||
on knows that a callout will | ||||
not be provided to pcre2_match(), so that callouts in the pattern ar | ||||
e not silently ignored. | ||||
PCRE2_EXTRA_TURKISH_CASING | ||||
This option alters case-equivalence of the 'i' letters to follow | ||||
the alphabet used by Turkish | ||||
and Azeri languages. The option can be changed within a pattern by t | ||||
he (*TURKISH_CASING) start- | ||||
of-pattern setting. Either the UTF or UCP options must be set. In th | ||||
e 8-bit library, UTF must be | ||||
set. This option cannot be combined with PCRE2_EXTRA_CASELESS_RESTRI | ||||
CT. | ||||
JUST-IN-TIME (JIT) COMPILATION | JUST-IN-TIME (JIT) COMPILATION | |||
int pcre2_jit_compile(pcre2_code *code, uint32_t options); | int pcre2_jit_compile(pcre2_code *code, uint32_t options); | |||
int pcre2_jit_match(const pcre2_code *code, PCRE2_SPTR subject, | int pcre2_jit_match(const pcre2_code *code, PCRE2_SPTR subject, | |||
PCRE2_SIZE length, PCRE2_SIZE startoffset, | PCRE2_SIZE length, PCRE2_SIZE startoffset, | |||
uint32_t options, pcre2_match_data *match_data, | uint32_t options, pcre2_match_data *match_data, | |||
pcre2_match_context *mcontext); | pcre2_match_context *mcontext); | |||
void pcre2_jit_free_unused_memory(pcre2_general_context *gcontext); | void pcre2_jit_free_unused_memory(pcre2_general_context *gcontext); | |||
skipping to change at line 1752 | skipping to change at line 1883 | |||
When .* is the first significant item, anchoring is possible only wh en all the following are | When .* is the first significant item, anchoring is possible only wh en all the following are | |||
true: | true: | |||
.* is not in an atomic group | .* is not in an atomic group | |||
.* is not in a capture group that is the subject | .* is not in a capture group that is the subject | |||
of a backreference | of a backreference | |||
PCRE2_DOTALL is in force for .* | PCRE2_DOTALL is in force for .* | |||
Neither (*PRUNE) nor (*SKIP) appears in the pattern | Neither (*PRUNE) nor (*SKIP) appears in the pattern | |||
PCRE2_NO_DOTSTAR_ANCHOR is not set | PCRE2_NO_DOTSTAR_ANCHOR is not set | |||
Dotstar anchoring has not been disabled with PCRE2_DOTSTAR_ANCHOR_ OFF | ||||
For patterns that are auto-anchored, the PCRE2_ANCHORED bit is set in the options returned for | For patterns that are auto-anchored, the PCRE2_ANCHORED bit is set in the options returned for | |||
PCRE2_INFO_ALLOPTIONS. | PCRE2_INFO_ALLOPTIONS. | |||
PCRE2_INFO_BACKREFMAX | PCRE2_INFO_BACKREFMAX | |||
Return the number of the highest backreference in the pattern. The t hird argument should point | Return the number of the highest backreference in the pattern. The t hird argument should point | |||
to a uint32_t variable. Named capture groups acquire numbers as we ll as names, and these count | to a uint32_t variable. Named capture groups acquire numbers as we ll as names, and these count | |||
towards the highest backreference. Backreferences such as \4 or \g{1 2} match the captured char‐ | towards the highest backreference. Backreferences such as \4 or \g{1 2} match the captured char‐ | |||
acters of the given group, but in addition, the check that a capt ure group is set in a condi‐ | acters of the given group, but in addition, the check that a capt ure group is set in a condi‐ | |||
skipping to change at line 2805 | skipping to change at line 2937 | |||
If the function is not successful, the value set via outlengthptr d epends on the type of error. | If the function is not successful, the value set via outlengthptr d epends on the type of error. | |||
For syntax errors in the replacement string, the value is the offset in the replacement string | For syntax errors in the replacement string, the value is the offset in the replacement string | |||
where the error was detected. For other errors, the value is PCRE 2_UNSET by default. This in‐ | where the error was detected. For other errors, the value is PCRE 2_UNSET by default. This in‐ | |||
cludes the case of the output buffer being too small, unless PCRE2_S UBSTITUTE_OVERFLOW_LENGTH is | cludes the case of the output buffer being too small, unless PCRE2_S UBSTITUTE_OVERFLOW_LENGTH is | |||
set. | set. | |||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH changes what happens when the outpu t buffer is too small. The | PCRE2_SUBSTITUTE_OVERFLOW_LENGTH changes what happens when the outpu t buffer is too small. The | |||
default action is to return PCRE2_ERROR_NOMEMORY immediately. If this option is set, however, | default action is to return PCRE2_ERROR_NOMEMORY immediately. If this option is set, however, | |||
pcre2_substitute() continues to go through the motions of matching a nd substituting (without, of | pcre2_substitute() continues to go through the motions of matching a nd substituting (without, of | |||
course, writing anything) in order to compute the size of buffer tha | course, writing anything) in order to compute the size of buffer tha | |||
t is needed. This value is | t is needed, which will in‐ | |||
passed back via the outlengthptr variable, with the result of the fu | clude the extra space for the terminating NUL. This value is pass | |||
nction still being PCRE2_ER‐ | ed back via the outlengthptr | |||
ROR_NOMEMORY. | variable, with the result of the function still being PCRE2_ERROR_NO | |||
MEMORY. | ||||
Passing a buffer size of zero is a permitted way of finding out h | Passing a buffer size of zero is a permitted way of finding out how | |||
ow much memory is needed for | much memory is needed for | |||
a given substitution. However, this does mean that the entire operatio | a given substitution. However, this does mean that the entire operati | |||
n is carried out twice. De‐ | on is carried out twice. De‐ | |||
pending on the application, it may be more efficient to allocate a l arge buffer and free the ex‐ | pending on the application, it may be more efficient to allocate a l arge buffer and free the ex‐ | |||
cess afterwards, instead of using PCRE2_SUBSTITUTE_OVERFLOW_LENGTH. | cess afterwards, instead of using PCRE2_SUBSTITUTE_OVERFLOW_LENGTH. | |||
The replacement string, which is interpreted as a UTF string in UTF mode, is checked for UTF va‐ | The replacement string, which is interpreted as a UTF string in UTF mode, is checked for UTF va‐ | |||
lidity unless PCRE2_NO_UTF_CHECK is set. An invalid UTF replacemen t string causes an immediate | lidity unless PCRE2_NO_UTF_CHECK is set. An invalid UTF replacement string causes an immediate | |||
return with the relevant UTF error code. | return with the relevant UTF error code. | |||
If PCRE2_SUBSTITUTE_LITERAL is set, the replacement string is not in terpreted in any way. By de‐ | If PCRE2_SUBSTITUTE_LITERAL is set, the replacement string is not in terpreted in any way. By de‐ | |||
fault, however, a dollar character is an escape character that can | fault, however, a dollar character is an escape character that | |||
specify the insertion of | can specify the insertion of | |||
characters from capture groups and names from (*MARK) or other c | characters from capture groups and names from (*MARK) or other contr | |||
ontrol verbs in the pattern. | ol verbs in the pattern. | |||
Dollar is the only escape character (backslash is treated as literal | Dollar is the only escape character (backslash is treated as liter | |||
). The following forms are | al). The following forms are | |||
always recognized: | recognized: | |||
$$ insert a dollar character | $$ insert a dollar character | |||
$<n> or ${<n>} insert the contents of group <n> | $n or ${n} insert the contents of group n | |||
$0 or $& insert the entire matched substring | ||||
$` insert the substring that precedes the match | ||||
$' insert the substring that follows the match | ||||
$_ insert the entire input string | ||||
$*MARK or ${*MARK} insert a control verb name | $*MARK or ${*MARK} insert a control verb name | |||
Either a group number or a group name can be given for <n>. Curly b | Either a group number or a group name can be given for n, for exampl | |||
rackets are required only if | e $2 or $NAME. Curly brack‐ | |||
the following character would be interpreted as part of the number o | ets are required only if the following character would be interpre | |||
r name. The number may be | ted as part of the number or | |||
zero to include the entire matched string. For example, if the p | name. The number may be zero to include the entire matched string. F | |||
attern a(b)c is matched with | or example, if the pattern | |||
"=abc=" and the replacement string "+$1$0$1+", the result is "=+babc | a(b)c is matched with "=abc=" and the replacement string "+$1$0$1+", | |||
b+=". | the result is "=+babcb+=". | |||
The JavaScript form $<name>, where the angle brackets are part of th | ||||
e syntax, is also recognized | ||||
for group names, but not for group numbers or *MARK. | ||||
$*MARK inserts the name from the last encountered backtracking contr ol verb on the matching path | $*MARK inserts the name from the last encountered backtracking contr ol verb on the matching path | |||
that has a name. (*MARK) must always include a name, but the other v erbs need not. For example, | that has a name. (*MARK) must always include a name, but the other verbs need not. For example, | |||
in the case of (*MARK:A)(*PRUNE) the name inserted is "A", but for ( *MARK:A)(*PRUNE:B) the rele‐ | in the case of (*MARK:A)(*PRUNE) the name inserted is "A", but for ( *MARK:A)(*PRUNE:B) the rele‐ | |||
vant name is "B". This facility can be used to perform simple si multaneous substitutions, as | vant name is "B". This facility can be used to perform simple simu ltaneous substitutions, as | |||
this pcre2test example shows: | this pcre2test example shows: | |||
/(*MARK:pear)apple|(*MARK:orange)lemon/g,replace=${*MARK} | /(*MARK:pear)apple|(*MARK:orange)lemon/g,replace=${*MARK} | |||
apple lemon | apple lemon | |||
2: pear orange | 2: pear orange | |||
PCRE2_SUBSTITUTE_GLOBAL causes the function to iterate over the subj | PCRE2_SUBSTITUTE_GLOBAL causes the function to iterate over the sub | |||
ect string, replacing every | ject string, replacing every | |||
matching substring. If this option is not set, only the first mat | matching substring. If this option is not set, only the first matchi | |||
ching substring is replaced. | ng substring is replaced. | |||
The search for matches takes place in the original subject string (t | The search for matches takes place in the original subject string | |||
hat is, previous replace‐ | (that is, previous replace‐ | |||
ments do not affect it). Iteration is implemented by advancing th | ments do not affect it). Iteration is implemented by advancing the | |||
e startoffset value for each | startoffset value for each | |||
search, which is always passed the entire subject string. If an offs et limit is set in the match | search, which is always passed the entire subject string. If an offs et limit is set in the match | |||
context, searching stops when that limit is reached. | context, searching stops when that limit is reached. | |||
You can restrict the effect of a global substitution to a portion of the subject string by set‐ | You can restrict the effect of a global substitution to a portion o f the subject string by set‐ | |||
ting either or both of startoffset and an offset limit. Here is a pc re2test example: | ting either or both of startoffset and an offset limit. Here is a pc re2test example: | |||
/B/g,replace=!,use_offset_limit | /B/g,replace=!,use_offset_limit | |||
ABC ABC ABC ABC\=offset=3,offset_limit=12 | ABC ABC ABC ABC\=offset=3,offset_limit=12 | |||
2: ABC A!C A!C ABC | 2: ABC A!C A!C ABC | |||
When continuing with global substitutions after matching a substr ing with zero length, an at‐ | When continuing with global substitutions after matching a substring with zero length, an at‐ | |||
tempt to find a non-empty match at the same offset is performed. If this is not successful, the | tempt to find a non-empty match at the same offset is performed. If this is not successful, the | |||
offset is advanced by one character except when CRLF is a valid newl ine sequence and the next | offset is advanced by one character except when CRLF is a valid n ewline sequence and the next | |||
two characters are CR, LF. In this case, the offset is advanced by t wo characters. | two characters are CR, LF. In this case, the offset is advanced by t wo characters. | |||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET causes references to capture grou | PCRE2_SUBSTITUTE_UNKNOWN_UNSET causes references to capture groups t | |||
ps that do not appear in the | hat do not appear in the | |||
pattern to be treated as unset groups. This option should be used wi | pattern to be treated as unset groups. This option should be used | |||
th care, because it means | with care, because it means | |||
that a typo in a group name or number no longer causes the PCRE2_ERR OR_NOSUBSTRING error. | that a typo in a group name or number no longer causes the PCRE2_ERR OR_NOSUBSTRING error. | |||
PCRE2_SUBSTITUTE_UNSET_EMPTY causes unset capture groups (incl uding unknown groups when | PCRE2_SUBSTITUTE_UNSET_EMPTY causes unset capture groups (inclu ding unknown groups when | |||
PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) to be treated as empty string s when inserted as described | PCRE2_SUBSTITUTE_UNKNOWN_UNSET is set) to be treated as empty string s when inserted as described | |||
above. If this option is not set, an attempt to insert an unset grou p causes the PCRE2_ERROR_UN‐ | above. If this option is not set, an attempt to insert an unset grou p causes the PCRE2_ERROR_UN‐ | |||
SET error. This option does not influence the extended substitution syntax described below. | SET error. This option does not influence the extended substitution syntax described below. | |||
PCRE2_SUBSTITUTE_EXTENDED causes extra processing to be applied to t he replacement string. With‐ | PCRE2_SUBSTITUTE_EXTENDED causes extra processing to be applied to t he replacement string. With‐ | |||
out this option, only the dollar character is special, and only the group insertion forms listed | out this option, only the dollar character is special, and only the group insertion forms listed | |||
above are valid. When PCRE2_SUBSTITUTE_EXTENDED is set, two things c hange: | above are valid. When PCRE2_SUBSTITUTE_EXTENDED is set, several thin gs change: | |||
Firstly, backslash in a replacement string is interpreted as an es | Firstly, backslash in a replacement string is interpreted as an | |||
cape character. The usual | escape character. The usual | |||
forms such as \n or \x{ddd} can be used to specify particular ch | forms such as \x{ddd} can be used to specify particular character co | |||
aracter codes, and backslash | des, and backslash followed | |||
followed by any non-alphanumeric character quotes that character. Ex | by any non-alphanumeric character quotes that character. Extende | |||
tended quoting can be coded | d quoting can be coded using | |||
using \Q...\E, exactly as in pattern strings. | \Q...\E, exactly as in pattern strings. The escapes \b and \v are in | |||
terpreted as the characters | ||||
There are also four escape sequences for forcing the case of inse | backspace and vertical tab, respectively. | |||
rted letters. The insertion | ||||
mechanism has three states: no case forcing, force upper case, and f | The interpretation of backslash followed by one or more digits | |||
orce lower case. The escape | is the same as in a pattern, | |||
sequences change the current state: \U and \L change to upper or | which in Perl has some ambiguities. Details are given in the pcre2pa | |||
lower case forcing, respec‐ | ttern page. | |||
tively, and \E (when not terminating a \Q quoted sequence) reverts t | ||||
o no case forcing. The se‐ | The Python form \g<n>, where the angle brackets are part of the synt | |||
quences \u and \l force the next character (if it is a letter) to | ax and n is either a group | |||
upper or lower case, respec‐ | name or number, is recognized as an alternative way of inserting the | |||
tively, and then the state automatically reverts to no case forcing. | contents of a group, for ex‐ | |||
Case forcing applies to all | ample \g<3>. | |||
inserted characters, including those from capture groups and letters | ||||
within \Q...\E quoted se‐ | There are also four escape sequences for forcing the case of inserte | |||
quences. If either PCRE2_UTF or PCRE2_UCP was set when the pattern w | d letters. Case forcing ap‐ | |||
as compiled, Unicode proper‐ | plies to all inserted characters, including those from capture group | |||
ties are used for case forcing characters whose code points are grea | s and letters within \Q...\E | |||
ter than 127. | quoted sequences. The insertion mechanism has three states: no case | |||
forcing, force upper case, | ||||
and force lower case. The escape sequences change the current stat | ||||
e: \U and \L change to upper | ||||
or lower case forcing, respectively, and \E (when not terminating a | ||||
\Q quoted sequence) reverts | ||||
to no case forcing. The sequences \u and \l force the next characte | ||||
r (if it is a letter) to up‐ | ||||
per or lower case, respectively, and then the state automatically re | ||||
verts to no case forcing. | ||||
However, if \u is immediately followed by \L or \l is immediately fo | ||||
llowed by \U, the next char‐ | ||||
acter's case is forced by the first escape sequence, and subsequent | ||||
characters by the second. | ||||
This provides a "title casing" facility that can be applied to gro | ||||
up captures. For example, if | ||||
group 1 has captured "heLLo", the replacement string "\u\L$1" become | ||||
s "Hello". | ||||
If either PCRE2_UTF or PCRE2_UCP was set when the pattern was compil | ||||
ed, Unicode properties are | ||||
used for case forcing characters whose code points are greater th | ||||
an 127. However, only simple | ||||
case folding, as determined by the Unicode file CaseFolding.txt, is | ||||
supported. PCRE2 does not | ||||
support language-specific special casing rules such as using diffe | ||||
rent lower case Greek sigmas | ||||
in the middle and ends of words (as defined in the Unicode file Spec | ||||
ialCasing.txt). | ||||
Note that case forcing sequences such as \U...\E do not nest. For | Note that case forcing sequences such as \U...\E do not nest. For ex | |||
example, the result of pro‐ | ample, the result of pro‐ | |||
cessing "\Uaa\LBB\Ecc\E" is "AAbbcc"; the final \E has no ef | cessing "\Uaa\LBB\Ecc\E" is "AAbbcc"; the final \E has no e | |||
fect. Note also that the | ffect. Note also that the | |||
PCRE2_ALT_BSUX and PCRE2_EXTRA_ALT_BSUX options do not apply to repl acement strings. | PCRE2_ALT_BSUX and PCRE2_EXTRA_ALT_BSUX options do not apply to repl acement strings. | |||
The second effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add more flexibility to capture | The final effect of setting PCRE2_SUBSTITUTE_EXTENDED is to add mo re flexibility to capture | |||
group substitution. The syntax is similar to that used by Bash: | group substitution. The syntax is similar to that used by Bash: | |||
${<n>:-<string>} | ${n:-string} | |||
${<n>:+<string1>:<string2>} | ${n:+string1:string2} | |||
As before, <n> may be a group number or a name. The first form speci | As in the simple case, n may be a group number or a name. The fi | |||
fies a default value. If | rst form specifies a default | |||
group <n> is set, its value is inserted; if not, <string> is expan | value. If group n is set, its value is inserted; if not, the string | |||
ded and the result inserted. | is expanded and the result | |||
The second form specifies strings that are expanded and inserted whe | inserted. The second form specifies strings that are expanded and | |||
n group <n> is set or unset, | inserted when group n is set | |||
respectively. The first form is just a convenient shorthand for | or unset, respectively. The first form is just a convenient shorthan | |||
d for | ||||
${<n>:+${<n>}:<string>} | ${n:+${n}:string} | |||
Backslash can be used to escape colons and closing curly brackets in the replacement strings. A | Backslash can be used to escape colons and closing curly brackets in the replacement strings. A | |||
change of the case forcing state within a replacement string rem ains in force afterwards, as | change of the case forcing state within a replacement string rem ains in force afterwards, as | |||
shown in this pcre2test example: | shown in this pcre2test example: | |||
/(some)?(body)/substitute_extended,replace=${1:+\U:\L}HeLLo | /(some)?(body)/substitute_extended,replace=${1:+\U:\L}HeLLo | |||
body | body | |||
1: hello | 1: hello | |||
somebody | somebody | |||
1: HELLO | 1: HELLO | |||
skipping to change at line 2963 | skipping to change at line 3119 | |||
Substitution callouts | Substitution callouts | |||
int pcre2_set_substitute_callout(pcre2_match_context *mcontext, | int pcre2_set_substitute_callout(pcre2_match_context *mcontext, | |||
int (*callout_function)(pcre2_substitute_callout_block *, void *), | int (*callout_function)(pcre2_substitute_callout_block *, void *), | |||
void *callout_data); | void *callout_data); | |||
The pcre2_set_substitute_callout() function can be used to specif y a callout function for | The pcre2_set_substitute_callout() function can be used to specif y a callout function for | |||
pcre2_substitute(). This information is passed in a match cont ext. The callout function is | pcre2_substitute(). This information is passed in a match cont ext. The callout function is | |||
called after each substitution has been processed, but it can cause the replacement not to hap‐ | called after each substitution has been processed, but it can cause the replacement not to hap‐ | |||
pen. The callout function is not called for simulated substitution | pen. | |||
s that happen as a result of | ||||
the PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option. | The callout function is not called for simulated substitutions th | |||
at happen as a result of the | ||||
PCRE2_SUBSTITUTE_OVERFLOW_LENGTH option. In this mode, when substitu | ||||
tion processing exceeds the | ||||
buffer space provided by the caller, processing continues by countin | ||||
g code units. The simulation | ||||
is unable to populate the callout block, and so the simulation is pe | ||||
ssimistic about the required | ||||
buffer size. Whichever is larger of accepted or rejected substit | ||||
ution is reported as the re‐ | ||||
quired size. Therefore, the returned buffer length may be an overest | ||||
imate (without a substitu‐ | ||||
tion callout, it is normally an exact measurement). | ||||
The first argument of the callout function is a pointer to a substit ute callout block structure, | The first argument of the callout function is a pointer to a substit ute callout block structure, | |||
which contains the following fields, not necessarily in this order: | which contains the following fields, not necessarily in this order: | |||
uint32_t version; | uint32_t version; | |||
uint32_t subscount; | uint32_t subscount; | |||
PCRE2_SPTR input; | PCRE2_SPTR input; | |||
PCRE2_SPTR output; | PCRE2_SPTR output; | |||
PCRE2_SIZE *ovector; | PCRE2_SIZE *ovector; | |||
uint32_t oveccount; | uint32_t oveccount; | |||
PCRE2_SIZE output_offsets[2]; | PCRE2_SIZE output_offsets[2]; | |||
The version field contains the version number of the block format. T he current version is 0. The | The version field contains the version number of the block format. T he current version is 0. The | |||
version number will increase in future if more fields are added, but the intention is never to | version number will increase in future if more fields are added, b ut the intention is never to | |||
remove any of the existing fields. | remove any of the existing fields. | |||
The subscount field is the number of the current match. It is 1 for the first callout, 2 for the | The subscount field is the number of the current match. It is 1 for the first callout, 2 for the | |||
second, and so on. The input and output pointers are copies of the values passed to pcre2_sub‐ | second, and so on. The input and output pointers are copies of the v alues passed to pcre2_sub‐ | |||
stitute(). | stitute(). | |||
The ovector field points to the ovector, which contains the result o f the most recent match. The | The ovector field points to the ovector, which contains the result o f the most recent match. The | |||
oveccount field contains the number of pairs that are set in the ove ctor, and is always greater | oveccount field contains the number of pairs that are set in the ov ector, and is always greater | |||
than zero. | than zero. | |||
The output_offsets vector contains the offsets of the replacement in the output string. This has | The output_offsets vector contains the offsets of the replacement in the output string. This has | |||
already been processed for dollar and (if requested) backslash subst itutions as described above. | already been processed for dollar and (if requested) backslash subst itutions as described above. | |||
The second argument of the callout function is the value passed as callout_data when the func‐ | The second argument of the callout function is the value passed as c allout_data when the func‐ | |||
tion was registered. The value returned by the callout function is i nterpreted as follows: | tion was registered. The value returned by the callout function is i nterpreted as follows: | |||
If the value is zero, the replacement is accepted, and, if PCRE2_SUB | If the value is zero, the replacement is accepted, and, if PCRE2_SU | |||
STITUTE_GLOBAL is set, pro‐ | BSTITUTE_GLOBAL is set, pro‐ | |||
cessing continues with a search for the next match. If the value | cessing continues with a search for the next match. If the value is | |||
is not zero, the current re‐ | not zero, the current re‐ | |||
placement is not accepted. If the value is greater than zero, | placement is not accepted. If the value is greater than zer | |||
processing continues when | o, processing continues when | |||
PCRE2_SUBSTITUTE_GLOBAL is set. Otherwise (the value is less | PCRE2_SUBSTITUTE_GLOBAL is set. Otherwise (the value is less t | |||
than zero or PCRE2_SUBSTI‐ | han zero or PCRE2_SUBSTI‐ | |||
TUTE_GLOBAL is not set), the rest of the input is copied to the | TUTE_GLOBAL is not set), the rest of the input is copied t | |||
output and the call to | o the output and the call to | |||
pcre2_substitute() exits, returning the number of matches so far. | pcre2_substitute() exits, returning the number of matches so far. | |||
Substitution case callouts | ||||
int pcre2_set_substitute_case_callout(pcre2_match_context *mcontext, | ||||
PCRE2_SIZE (*callout_function)(PCRE2_SPTR, PCRE2_SIZE, | ||||
PCRE2_UCHAR *, PCRE2_SIZE, | ||||
int, void *), | ||||
void *callout_data); | ||||
The pcre2_set_substitute_case_callout() function can be used to specify a callout function for | ||||
pcre2_substitute() to use when performing case transformations. This | ||||
does not affect any case | ||||
insensitivity behaviour when performing a match, but only the user- | ||||
visible transformations per‐ | ||||
formed when processing a substitution such as: | ||||
pcre2_substitute(..., "\\U$1", ...) | ||||
The default case transformations applied by PCRE2 are reasonably com | ||||
plete, and, in UTF or UCP | ||||
mode, perform the simple locale-invariant case transformations as s | ||||
pecified by Unicode. This is | ||||
suitable for the internal (invisible) case-equivalence procedures us | ||||
ed during pattern matching, | ||||
but an application may wish to use more sophisticated locale-aware p | ||||
rocessing for the user-visi‐ | ||||
ble substitution transformations. | ||||
One example implementation of the callout_function using the ICU lib | ||||
rary would be: | ||||
PCRE2_SIZE | ||||
icu_case_callout( | ||||
PCRE2_SPTR input, PCRE2_SIZE input_len, | ||||
PCRE2_UCHAR *output, PCRE2_SIZE output_cap, | ||||
int to_case, void *data_ptr) | ||||
{ | ||||
UErrorCode err = U_ZERO_ERROR; | ||||
int32_t r = to_case == PCRE2_SUBSTITUTE_CASE_LOWER | ||||
? u_strToLower(output, output_cap, input, input_len, NULL, & | ||||
err) | ||||
: to_case == PCRE2_SUBSTITUTE_CASE_UPPER | ||||
? u_strToUpper(output, output_cap, input, input_len, NULL, & | ||||
err) | ||||
: u_strToTitle(output, output_cap, input, input_len, &first_ | ||||
char_only, | ||||
NULL, &err); | ||||
if (U_FAILURE(err)) return (~(PCRE2_SIZE)0); | ||||
return r; | ||||
} | ||||
The first and second arguments of the case callout function are the | ||||
Unicode string to transform. | ||||
The third and fourth arguments are the output buffer and its capacit | ||||
y. | ||||
The fifth is one of the constants PCRE2_SUBSTITUTE_CASE_LOWER, PCR | ||||
E2_SUBSTITUTE_CASE_UPPER, or | ||||
PCRE2_SUBSTITUTE_CASE_TITLE_FIRST. PCRE2_SUBSTITUTE_CASE_LOWER and | ||||
PCRE2_SUBSTITUTE_CASE_UPPER | ||||
are passed to the callout to indicate that the case of the entire c | ||||
allout input should be case- | ||||
transformed. PCRE2_SUBSTITUTE_CASE_TITLE_FIRST is passed to indicate | ||||
that only the first charac‐ | ||||
ter or glyph should be transformed to Unicode titlecase and the rest | ||||
to Unicode lowercase (note | ||||
that titlecasing sometimes uses Unicode properties to titlecase each | ||||
word in a string; but PCRE2 | ||||
is requesting that only the single leading character is to be titlec | ||||
ased). | ||||
The sixth argument is the callout_data supplied to pcre2_set_substit | ||||
ute_case_callout(). | ||||
The resulting string in the destination buffer may be larger or sm | ||||
aller than the input, if the | ||||
casing rules merge or split characters. The return value is the leng | ||||
th required for the output | ||||
string. If a buffer of sufficient size was provided to the call | ||||
out, then the result must be | ||||
written to the buffer and the number of code units returned. If the | ||||
result does not fit in the | ||||
provided buffer, then the required capacity must be returned and PC | ||||
RE2 will not make use of the | ||||
output buffer. PCRE2 provides input and output buffers which overlap | ||||
, so the callout must sup‐ | ||||
port this by suitable internal buffering. | ||||
Alternatively, if the callout wishes to indicate an error, then it | ||||
may return (~(PCRE2_SIZE)0). | ||||
In this case pcre2_substitute() will immediately fail with error PCR | ||||
E2_ERROR_REPLACECASE. | ||||
When a case callout is combined with the PCRE2_SUBSTITUTE_OVERFLOW_L | ||||
ENGTH option, there are sit‐ | ||||
uations when pcre2_substitute() will return an underestimate of the | ||||
required buffer size. If you | ||||
call pcre2_substitute() once with PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, | ||||
and the input buffer is too | ||||
small for the replacement string to be constructed, then instead | ||||
of calling the case callout, | ||||
pcre2_substitute() will make an estimate of the required buffer size | ||||
. The second call should | ||||
also pass PCRE2_SUBSTITUTE_OVERFLOW_LENGTH, because that second ca | ||||
ll is not guaranteed to suc‐ | ||||
ceed either, if the case callout requires more buffer space than exp | ||||
ected. The caller must make | ||||
repeated attempts in a loop. | ||||
DUPLICATE CAPTURE GROUP NAMES | DUPLICATE CAPTURE GROUP NAMES | |||
int pcre2_substring_nametable_scan(const pcre2_code *code, | int pcre2_substring_nametable_scan(const pcre2_code *code, | |||
PCRE2_SPTR name, PCRE2_SPTR *first, PCRE2_SPTR *last); | PCRE2_SPTR name, PCRE2_SPTR *first, PCRE2_SPTR *last); | |||
When a pattern is compiled with the PCRE2_DUPNAMES option, names for capture groups are not re‐ | When a pattern is compiled with the PCRE2_DUPNAMES option, names for capture groups are not re‐ | |||
quired to be unique. Duplicate names are always allowed for groups with the same number, created | quired to be unique. Duplicate names are always allowed for groups with the same number, created | |||
by using the (?| feature. Indeed, if such groups are named, they are required to use the same | by using the (?| feature. Indeed, if such groups are named, they are required to use the same | |||
names. | names. | |||
skipping to change at line 3213 | skipping to change at line 3450 | |||
pcre2sample(3), pcre2unicode(3). | pcre2sample(3), pcre2unicode(3). | |||
AUTHOR | AUTHOR | |||
Philip Hazel | Philip Hazel | |||
Retired from University Computing Service | Retired from University Computing Service | |||
Cambridge, England. | Cambridge, England. | |||
REVISION | REVISION | |||
Last updated: 24 April 2024 | Last updated: 26 December 2024 | |||
Copyright (c) 1997-2024 University of Cambridge. | Copyright (c) 1997-2024 University of Cambridge. | |||
PCRE2 10.44 24 April 2024 PCRE2API(3) | PCRE2 10.45-RC1 26 December 2024 PCRE2API(3) | |||
End of changes. 135 change blocks. | ||||
526 lines changed or deleted | 874 lines changed or added | |||
This html diff was produced by rfcdiff 1.41. The latest version is available from http://tools.ietf.org/tools/rfcdiff/ |