mirror of https://github.com/postgres/postgres
Tag:
Branch:
Tree:
1d369c9e90
REL2_0B
REL6_4
REL6_5_PATCHES
REL7_0_PATCHES
REL7_1_STABLE
REL7_2_STABLE
REL7_3_STABLE
REL7_4_STABLE
REL8_0_STABLE
REL8_1_STABLE
REL8_2_STABLE
REL8_3_STABLE
REL8_4_STABLE
REL8_5_ALPHA1_BRANCH
REL8_5_ALPHA2_BRANCH
REL8_5_ALPHA3_BRANCH
REL9_0_ALPHA4_BRANCH
REL9_0_ALPHA5_BRANCH
REL9_0_STABLE
REL9_1_STABLE
REL9_2_STABLE
REL9_3_STABLE
REL9_4_STABLE
REL9_5_STABLE
REL9_6_STABLE
REL_10_STABLE
REL_11_STABLE
REL_12_STABLE
REL_13_STABLE
REL_14_STABLE
REL_15_STABLE
REL_16_STABLE
REL_17_STABLE
REL_18_STABLE
Release_1_0_3
WIN32_DEV
ecpg_big_bison
master
PG95-1_01
PG95-1_08
PG95-1_09
REL2_0
REL6_1
REL6_1_1
REL6_2
REL6_2_1
REL6_3
REL6_3_2
REL6_4_2
REL6_5
REL6_5_1
REL6_5_2
REL6_5_3
REL7_0
REL7_0_2
REL7_0_3
REL7_1
REL7_1_1
REL7_1_2
REL7_1_3
REL7_1_BETA
REL7_1_BETA2
REL7_1_BETA3
REL7_2
REL7_2_1
REL7_2_2
REL7_2_3
REL7_2_4
REL7_2_5
REL7_2_6
REL7_2_7
REL7_2_8
REL7_2_BETA1
REL7_2_BETA2
REL7_2_BETA3
REL7_2_BETA4
REL7_2_BETA5
REL7_2_RC1
REL7_2_RC2
REL7_3
REL7_3_1
REL7_3_10
REL7_3_11
REL7_3_12
REL7_3_13
REL7_3_14
REL7_3_15
REL7_3_16
REL7_3_17
REL7_3_18
REL7_3_19
REL7_3_2
REL7_3_20
REL7_3_21
REL7_3_3
REL7_3_4
REL7_3_5
REL7_3_6
REL7_3_7
REL7_3_8
REL7_3_9
REL7_4
REL7_4_1
REL7_4_10
REL7_4_11
REL7_4_12
REL7_4_13
REL7_4_14
REL7_4_15
REL7_4_16
REL7_4_17
REL7_4_18
REL7_4_19
REL7_4_2
REL7_4_20
REL7_4_21
REL7_4_22
REL7_4_23
REL7_4_24
REL7_4_25
REL7_4_26
REL7_4_27
REL7_4_28
REL7_4_29
REL7_4_3
REL7_4_30
REL7_4_4
REL7_4_5
REL7_4_6
REL7_4_7
REL7_4_8
REL7_4_9
REL7_4_BETA1
REL7_4_BETA2
REL7_4_BETA3
REL7_4_BETA4
REL7_4_BETA5
REL7_4_RC1
REL7_4_RC2
REL8_0_0
REL8_0_0BETA1
REL8_0_0BETA2
REL8_0_0BETA3
REL8_0_0BETA4
REL8_0_0BETA5
REL8_0_0RC1
REL8_0_0RC2
REL8_0_0RC3
REL8_0_0RC4
REL8_0_0RC5
REL8_0_1
REL8_0_10
REL8_0_11
REL8_0_12
REL8_0_13
REL8_0_14
REL8_0_15
REL8_0_16
REL8_0_17
REL8_0_18
REL8_0_19
REL8_0_2
REL8_0_20
REL8_0_21
REL8_0_22
REL8_0_23
REL8_0_24
REL8_0_25
REL8_0_26
REL8_0_3
REL8_0_4
REL8_0_5
REL8_0_6
REL8_0_7
REL8_0_8
REL8_0_9
REL8_1_0
REL8_1_0BETA1
REL8_1_0BETA2
REL8_1_0BETA3
REL8_1_0BETA4
REL8_1_0RC1
REL8_1_1
REL8_1_10
REL8_1_11
REL8_1_12
REL8_1_13
REL8_1_14
REL8_1_15
REL8_1_16
REL8_1_17
REL8_1_18
REL8_1_19
REL8_1_2
REL8_1_20
REL8_1_21
REL8_1_22
REL8_1_23
REL8_1_3
REL8_1_4
REL8_1_5
REL8_1_6
REL8_1_7
REL8_1_8
REL8_1_9
REL8_2_0
REL8_2_1
REL8_2_10
REL8_2_11
REL8_2_12
REL8_2_13
REL8_2_14
REL8_2_15
REL8_2_16
REL8_2_17
REL8_2_18
REL8_2_19
REL8_2_2
REL8_2_20
REL8_2_21
REL8_2_22
REL8_2_23
REL8_2_3
REL8_2_4
REL8_2_5
REL8_2_6
REL8_2_7
REL8_2_8
REL8_2_9
REL8_2_BETA1
REL8_2_BETA2
REL8_2_BETA3
REL8_2_RC1
REL8_3_0
REL8_3_1
REL8_3_10
REL8_3_11
REL8_3_12
REL8_3_13
REL8_3_14
REL8_3_15
REL8_3_16
REL8_3_17
REL8_3_18
REL8_3_19
REL8_3_2
REL8_3_20
REL8_3_21
REL8_3_22
REL8_3_23
REL8_3_3
REL8_3_4
REL8_3_5
REL8_3_6
REL8_3_7
REL8_3_8
REL8_3_9
REL8_3_BETA1
REL8_3_BETA2
REL8_3_BETA3
REL8_3_BETA4
REL8_3_RC1
REL8_3_RC2
REL8_4_0
REL8_4_1
REL8_4_10
REL8_4_11
REL8_4_12
REL8_4_13
REL8_4_14
REL8_4_15
REL8_4_16
REL8_4_17
REL8_4_18
REL8_4_19
REL8_4_2
REL8_4_20
REL8_4_21
REL8_4_22
REL8_4_3
REL8_4_4
REL8_4_5
REL8_4_6
REL8_4_7
REL8_4_8
REL8_4_9
REL8_4_BETA1
REL8_4_BETA2
REL8_4_RC1
REL8_4_RC2
REL8_5_ALPHA1
REL8_5_ALPHA2
REL8_5_ALPHA3
REL9_0_0
REL9_0_1
REL9_0_10
REL9_0_11
REL9_0_12
REL9_0_13
REL9_0_14
REL9_0_15
REL9_0_16
REL9_0_17
REL9_0_18
REL9_0_19
REL9_0_2
REL9_0_20
REL9_0_21
REL9_0_22
REL9_0_23
REL9_0_3
REL9_0_4
REL9_0_5
REL9_0_6
REL9_0_7
REL9_0_8
REL9_0_9
REL9_0_ALPHA4
REL9_0_ALPHA5
REL9_0_BETA1
REL9_0_BETA2
REL9_0_BETA3
REL9_0_BETA4
REL9_0_RC1
REL9_1_0
REL9_1_1
REL9_1_10
REL9_1_11
REL9_1_12
REL9_1_13
REL9_1_14
REL9_1_15
REL9_1_16
REL9_1_17
REL9_1_18
REL9_1_19
REL9_1_2
REL9_1_20
REL9_1_21
REL9_1_22
REL9_1_23
REL9_1_24
REL9_1_3
REL9_1_4
REL9_1_5
REL9_1_6
REL9_1_7
REL9_1_8
REL9_1_9
REL9_1_ALPHA1
REL9_1_ALPHA2
REL9_1_ALPHA3
REL9_1_ALPHA4
REL9_1_ALPHA5
REL9_1_BETA1
REL9_1_BETA2
REL9_1_BETA3
REL9_1_RC1
REL9_2_0
REL9_2_1
REL9_2_10
REL9_2_11
REL9_2_12
REL9_2_13
REL9_2_14
REL9_2_15
REL9_2_16
REL9_2_17
REL9_2_18
REL9_2_19
REL9_2_2
REL9_2_20
REL9_2_21
REL9_2_22
REL9_2_23
REL9_2_24
REL9_2_3
REL9_2_4
REL9_2_5
REL9_2_6
REL9_2_7
REL9_2_8
REL9_2_9
REL9_2_BETA1
REL9_2_BETA2
REL9_2_BETA3
REL9_2_BETA4
REL9_2_RC1
REL9_3_0
REL9_3_1
REL9_3_10
REL9_3_11
REL9_3_12
REL9_3_13
REL9_3_14
REL9_3_15
REL9_3_16
REL9_3_17
REL9_3_18
REL9_3_19
REL9_3_2
REL9_3_20
REL9_3_21
REL9_3_22
REL9_3_23
REL9_3_24
REL9_3_25
REL9_3_3
REL9_3_4
REL9_3_5
REL9_3_6
REL9_3_7
REL9_3_8
REL9_3_9
REL9_3_BETA1
REL9_3_BETA2
REL9_3_RC1
REL9_4_0
REL9_4_1
REL9_4_10
REL9_4_11
REL9_4_12
REL9_4_13
REL9_4_14
REL9_4_15
REL9_4_16
REL9_4_17
REL9_4_18
REL9_4_19
REL9_4_2
REL9_4_20
REL9_4_21
REL9_4_22
REL9_4_23
REL9_4_24
REL9_4_25
REL9_4_26
REL9_4_3
REL9_4_4
REL9_4_5
REL9_4_6
REL9_4_7
REL9_4_8
REL9_4_9
REL9_4_BETA1
REL9_4_BETA2
REL9_4_BETA3
REL9_4_RC1
REL9_5_0
REL9_5_1
REL9_5_10
REL9_5_11
REL9_5_12
REL9_5_13
REL9_5_14
REL9_5_15
REL9_5_16
REL9_5_17
REL9_5_18
REL9_5_19
REL9_5_2
REL9_5_20
REL9_5_21
REL9_5_22
REL9_5_23
REL9_5_24
REL9_5_25
REL9_5_3
REL9_5_4
REL9_5_5
REL9_5_6
REL9_5_7
REL9_5_8
REL9_5_9
REL9_5_ALPHA1
REL9_5_ALPHA2
REL9_5_BETA1
REL9_5_BETA2
REL9_5_RC1
REL9_6_0
REL9_6_1
REL9_6_10
REL9_6_11
REL9_6_12
REL9_6_13
REL9_6_14
REL9_6_15
REL9_6_16
REL9_6_17
REL9_6_18
REL9_6_19
REL9_6_2
REL9_6_20
REL9_6_21
REL9_6_22
REL9_6_23
REL9_6_24
REL9_6_3
REL9_6_4
REL9_6_5
REL9_6_6
REL9_6_7
REL9_6_8
REL9_6_9
REL9_6_BETA1
REL9_6_BETA2
REL9_6_BETA3
REL9_6_BETA4
REL9_6_RC1
REL_10_0
REL_10_1
REL_10_10
REL_10_11
REL_10_12
REL_10_13
REL_10_14
REL_10_15
REL_10_16
REL_10_17
REL_10_18
REL_10_19
REL_10_2
REL_10_20
REL_10_21
REL_10_22
REL_10_23
REL_10_3
REL_10_4
REL_10_5
REL_10_6
REL_10_7
REL_10_8
REL_10_9
REL_10_BETA1
REL_10_BETA2
REL_10_BETA3
REL_10_BETA4
REL_10_RC1
REL_11_0
REL_11_1
REL_11_10
REL_11_11
REL_11_12
REL_11_13
REL_11_14
REL_11_15
REL_11_16
REL_11_17
REL_11_18
REL_11_19
REL_11_2
REL_11_20
REL_11_21
REL_11_22
REL_11_3
REL_11_4
REL_11_5
REL_11_6
REL_11_7
REL_11_8
REL_11_9
REL_11_BETA1
REL_11_BETA2
REL_11_BETA3
REL_11_BETA4
REL_11_RC1
REL_12_0
REL_12_1
REL_12_10
REL_12_11
REL_12_12
REL_12_13
REL_12_14
REL_12_15
REL_12_16
REL_12_17
REL_12_18
REL_12_19
REL_12_2
REL_12_20
REL_12_21
REL_12_22
REL_12_3
REL_12_4
REL_12_5
REL_12_6
REL_12_7
REL_12_8
REL_12_9
REL_12_BETA1
REL_12_BETA2
REL_12_BETA3
REL_12_BETA4
REL_12_RC1
REL_13_0
REL_13_1
REL_13_10
REL_13_11
REL_13_12
REL_13_13
REL_13_14
REL_13_15
REL_13_16
REL_13_17
REL_13_18
REL_13_19
REL_13_2
REL_13_20
REL_13_21
REL_13_22
REL_13_23
REL_13_3
REL_13_4
REL_13_5
REL_13_6
REL_13_7
REL_13_8
REL_13_9
REL_13_BETA1
REL_13_BETA2
REL_13_BETA3
REL_13_RC1
REL_14_0
REL_14_1
REL_14_10
REL_14_11
REL_14_12
REL_14_13
REL_14_14
REL_14_15
REL_14_16
REL_14_17
REL_14_18
REL_14_19
REL_14_2
REL_14_20
REL_14_3
REL_14_4
REL_14_5
REL_14_6
REL_14_7
REL_14_8
REL_14_9
REL_14_BETA1
REL_14_BETA2
REL_14_BETA3
REL_14_RC1
REL_15_0
REL_15_1
REL_15_10
REL_15_11
REL_15_12
REL_15_13
REL_15_14
REL_15_15
REL_15_2
REL_15_3
REL_15_4
REL_15_5
REL_15_6
REL_15_7
REL_15_8
REL_15_9
REL_15_BETA1
REL_15_BETA2
REL_15_BETA3
REL_15_BETA4
REL_15_RC1
REL_15_RC2
REL_16_0
REL_16_1
REL_16_10
REL_16_11
REL_16_2
REL_16_3
REL_16_4
REL_16_5
REL_16_6
REL_16_7
REL_16_8
REL_16_9
REL_16_BETA1
REL_16_BETA2
REL_16_BETA3
REL_16_RC1
REL_17_0
REL_17_1
REL_17_2
REL_17_3
REL_17_4
REL_17_5
REL_17_6
REL_17_7
REL_17_BETA1
REL_17_BETA2
REL_17_BETA3
REL_17_RC1
REL_18_0
REL_18_1
REL_18_BETA1
REL_18_BETA2
REL_18_BETA3
REL_18_RC1
Release_1_0_2
Release_2_0
Release_2_0_0
release-6-3
${ noResults }
210 Commits (1d369c9e90f311ec98b07a259cac48c404c773d5)
| Author | SHA1 | Message | Date |
|---|---|---|---|
|
|
c04c6c5d6f |
Avoid character classification in regex escape parsing.
For regex escape sequences, just test directly for the relevant ASCII characters rather than using locale-sensitive character classification. This fixes an assertion failure when a locale considers a non-ASCII character, such as "൧", to be a digit. Reported-by: Richard Guo Discussion: https://postgr.es/m/CAMbWs49Q6UoKGeT8pBkMtJGJd+16CBFZaaWUk9Du+2ERE5g_YA@mail.gmail.com Backpatch-through: 11 |
3 years ago |
|
|
db4f21e4a3 |
Redesign interrupt/cancel API for regex engine.
Previously, a PostgreSQL-specific callback checked by the regex engine had a way to trigger a special error code REG_CANCEL if it detected that the next call to CHECK_FOR_INTERRUPTS() would certainly throw via ereport(). A later proposed bugfix aims to move some complex logic out of signal handlers, so that it won't run until the next CHECK_FOR_INTERRUPTS(), which makes the above design impossible unless we split CHECK_FOR_INTERRUPTS() into two phases, one to run logic and another to ereport(). We may develop such a system in the future, but for the regex code it is no longer necessary. An earlier commit moved regex memory management over to our MemoryContext system. Given that the purpose of the two-phase interrupt checking was to free memory before throwing, something we don't need to worry about anymore, it seems simpler to inject CHECK_FOR_INTERRUPTS() directly into cancelation points, and just let it throw. Since the plan is to keep PostgreSQL-specific concerns separate from the main regex engine code (with a view to bein able to stay in sync with other projects), do this with a new macro INTERRUPT(), customizable in regcustom.h and defaulting to nothing. Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> Discussion: https://postgr.es/m/CA%2BhUKGK3PGKwcKqzoosamn36YW-fsuTdOPPF1i_rtEO%3DnEYKSg%40mail.gmail.com |
3 years ago |
|
|
bea3d7e383 |
Use MemoryContext API for regex memory management.
Previously, regex_t objects' memory was managed with malloc() and free() directly. Switch to palloc()-based memory management instead. Advantages: * memory used by cached regexes is now visible with MemoryContext observability tools * cleanup can be done automatically in certain failure modes (something that later commits will take advantage of) * cleanup can be done in bulk On the downside, there may be more fragmentation (wasted memory) due to per-regex MemoryContext objects. This is a problem shared with other cached objects in PostgreSQL and can probably be improved with later tuning. Thanks to Noah Misch for suggesting this general approach, which unblocks later work on interrupts. Suggested-by: Noah Misch <noah@leadboat.com> Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> Discussion: https://postgr.es/m/CA%2BhUKGK3PGKwcKqzoosamn36YW-fsuTdOPPF1i_rtEO%3DnEYKSg%40mail.gmail.com |
3 years ago |
|
|
6974a8f768 |
Refactor to introduce pg_locale_deterministic().
Avoids the need of callers to test for NULL, and also avoids the need to access the pg_locale_t structure directly. Reviewed-by: Peter Eisentraut, Peter Geoghegan Discussion: https://postgr.es/m/a581136455c940d7bd0ff482d3a2bd51af25a94f.camel%40j-davis.com |
3 years ago |
|
|
c8e1ba736b |
Update copyright for 2023
Backpatch-through: 11 |
3 years ago |
|
|
8284cf5f74 |
Add copyright notices to meson files
Discussion: https://postgr.es/m/222b43a5-2fb3-2c1b-9cd0-375d376c8246@dunslane.net |
3 years ago |
|
|
03bf971d2d |
Remove uses of register due to incompatibility with C++17 and up
The use in regexec.c could remain, since we only try to keep headers C++ clean. But there really doesn't seem to be a good reason to use register in that spot. Discussion: https://postgr.es/m/20220308185902.ibdqmasoaunzjrfc@alap3.anarazel.de |
3 years ago |
|
|
e6927270cd |
meson: Add initial version of meson based build system
Autoconf is showing its age, fewer and fewer contributors know how to wrangle it. Recursive make has a lot of hard to resolve dependency issues and slow incremental rebuilds. Our home-grown MSVC build system is hard to maintain for developers not using Windows and runs tests serially. While these and other issues could individually be addressed with incremental improvements, together they seem best addressed by moving to a more modern build system. After evaluating different build system choices, we chose to use meson, to a good degree based on the adoption by other open source projects. We decided that it's more realistic to commit a relatively early version of the new build system and mature it in tree. This commit adds an initial version of a meson based build system. It supports building postgres on at least AIX, FreeBSD, Linux, macOS, NetBSD, OpenBSD, Solaris and Windows (however only gcc is supported on aix, solaris). For Windows/MSVC postgres can now be built with ninja (faster, particularly for incremental builds) and msbuild (supporting the visual studio GUI, but building slower). Several aspects (e.g. Windows rc file generation, PGXS compatibility, LLVM bitcode generation, documentation adjustments) are done in subsequent commits requiring further review. Other aspects (e.g. not installing test-only extensions) are not yet addressed. When building on Windows with msbuild, builds are slower when using a visual studio version older than 2019, because those versions do not support MultiToolTask, required by meson for intra-target parallelism. The plan is to remove the MSVC specific build system in src/tools/msvc soon after reaching feature parity. However, we're not planning to remove the autoconf/make build system in the near future. Likely we're going to keep at least the parts required for PGXS to keep working around until all supported versions build with meson. Some initial help for postgres developers is at https://wiki.postgresql.org/wiki/Meson With contributions from Thomas Munro, John Naylor, Stone Tickle and others. Author: Andres Freund <andres@anarazel.de> Author: Nazir Bilal Yavuz <byavuz81@gmail.com> Author: Peter Eisentraut <peter@eisentraut.org> Reviewed-By: Peter Eisentraut <peter.eisentraut@enterprisedb.com> Discussion: https://postgr.es/m/20211012083721.hvixq4pnh2pixr3j@alap3.anarazel.de |
3 years ago |
|
|
cb8ff7ed5a |
Consistently use named parameters in regex code consistently.
Adjust a handful of remaining function prototypes that were overlooked
by recent commit
|
3 years ago |
|
|
bc2187ed63 |
Consistently use named parameters in regex code.
Make regex code consistently use named parameters in function declarations. Also make sure that parameter names from each function's declaration match corresponding definition parameter names. This makes Henry Spencer's regex code follow Postgres coding standards. Author: Peter Geoghegan <pg@bowt.ie> Reviewed-By: David Rowley <dgrowleyml@gmail.com> Discussion: https://postgr.es/m/CAH2-WznJt9CMM9KJTMjJh_zbL5hD9oX44qdJ4aqZtjFi-zA3Tg@mail.gmail.com |
3 years ago |
|
|
f25bed3801 |
Defend against stack overrun in a few more places.
SplitToVariants() in the ispell code, lseg_inside_poly() in geo_ops.c, and regex_selectivity_sub() in selectivity estimation could recurse until stack overflow; fix by adding check_stack_depth() calls. So could next() in the regex compiler, but that case is better fixed by converting its tail recursion to a loop. (We probably get better code that way too, since next() can now be inlined into its sole caller.) There remains a reachable stack overrun in the Turkish stemmer, but we'll need some advice from the Snowball people about how to fix that. Per report from Egor Chindyaskin and Alexander Lakhin. These mistakes are old, so back-patch to all supported branches. Richard Guo and Tom Lane Discussion: https://postgr.es/m/1661334672.728714027@f473.i.mail.ru |
3 years ago |
|
|
02c408e21a |
Remove redundant null pointer checks before free()
Per applicable standards, free() with a null pointer is a no-op. Systems that don't observe that are ancient and no longer relevant. Some PostgreSQL code already required this behavior, so this change does not introduce any new requirements, just makes the code more consistent. Discussion: https://www.postgresql.org/message-id/flat/dac5d2d0-98f5-94d9-8e69-46da2413593d%40enterprisedb.com |
4 years ago |
|
|
23e7b38bfe |
Pre-beta mechanical code beautification.
Run pgindent, pgperltidy, and reformat-dat-files. I manually fixed a couple of comments that pgindent uglified. |
4 years ago |
|
|
b99ccd2cb2 |
Call pg_newlocale_from_collation() also with default collation
Previously, callers of pg_newlocale_from_collation() did not call it if the collation was DEFAULT_COLLATION_OID and instead proceeded with a pg_locale_t of 0. Instead, now we call it anyway and have it return 0 if the default collation was passed. It already did this, so we just have to adjust the callers. This simplifies all the call sites and also makes future enhancements easier. After discussion and testing, the previous comment in pg_locale.c about avoiding this for performance reasons may have been mistaken since it was testing a very different patch version way back when. Reviewed-by: Julien Rouhaud <rjuju123@gmail.com> Discussion: https://www.postgresql.org/message-id/ed3baa81-7fac-7788-cc12-41e3f7917e34@enterprisedb.com |
4 years ago |
|
|
27b77ecf9f |
Update copyright for 2022
Backpatch-through: 10 |
4 years ago |
|
|
e757080e04 |
Make pg_regexec() robust against out-of-range search_start.
If search_start is greater than the length of the string, we should just return REG_NOMATCH immediately. (Note that the equality case should *not* be rejected, since the pattern might be able to match zero characters.) This guards various internal assumptions that the min of a range of string positions is not more than the max. Violation of those assumptions could allow an attempt to fetch string[search_start-1], possibly causing a crash. Jaime Casanova pointed out that this situation is reachable with the new regexp_xxx functions that accept a user-specified start position. I don't believe it's reachable via any in-core call site in v14 and below. However, extensions could possibly call pg_regexec with an out-of-range search_start, so let's back-patch the fix anyway. Discussion: https://postgr.es/m/20210911180357.GA6870@ahch-to |
4 years ago |
|
|
10d58228bb |
Doc: add a little about LACON execution to src/backend/regex/README.
I wrote this while thinking about a possible optimization, but it's a useful description of the existing code regardless of whether the optimization ever happens. So push it separately. |
4 years ago |
|
|
8f72becd6b |
Handle interaction of regexp's makesearch and MATCHALL more honestly.
Second thoughts about commit 824bf7190: we apply makesearch() to an NFA after having determined whether it is a MATCHALL pattern. Prepending ".*" doesn't make it non-MATCHALL, but it does change the maximum possible match length, and makesearch() failed to update that. This has no ill effects given the stylized usage of search NFAs, but it seems like it's better to keep the data structure consistent. In particular, fixing this allows more honest handling of the MATCHALL check in matchuntil(): we can now assert that maxmatchall is infinity, instead of lamely assuming that it should act that way. In passing, improve the code in dump[c]nfa so that infinite maxmatchall is printed as "inf" not a magic number. |
4 years ago |
|
|
65dc30ced6 |
Fix regexp misbehavior with capturing parens inside "{0}".
Regexps like "(.){0}...\1" drew an "invalid backreference number".
That's not unreasonable on its face, since the capture group will
never be matched if it's iterated zero times. However, other engines
such as Perl's don't complain about this, nor do we throw an error for
related cases such as "(.)|\1", even though that backref can never
succeed either. Also, if the zero-iterations case happens at runtime
rather than compile time --- say, "(x)*...\1" when there's no "x" to
be found --- that's not an error, we just deem the backref to not
match. Making this even less defensible, no error was thrown for
nested cases such as "((.)){0}...\2"; and to add insult to injury,
those cases could result in assertion failures instead. (It seems
that nothing especially bad happened in non-assert builds, though.)
Let's just fix it so that no error is thrown and instead the backref
is deemed to never match, so that compile-time detection of no
iterations behaves the same as run-time detection.
Per report from Mark Dilger. This appears to be an aboriginal error
in Spencer's library, so back-patch to all supported versions.
Pre-v14, it turns out to also be necessary to back-patch one aspect of
commits cb76fbd7e/00116dee5, namely to create capture-node subREs with
the begin/end states of their subexpressions, not the current lp/rp
of the outer parseqatom invocation. Otherwise delsub complains that
we're trying to disconnect a state from itself. This is a bit scary
but code examination shows that it's safe: in the pre-v14 code, if we
want to wrap iteration around the subexpression, the first thing we do
is overwrite the atom's begin/end fields with new states. So the
bogus values didn't survive long enough to be used for anything, except
if no iteration is required, in which case it doesn't matter.
Discussion: https://postgr.es/m/A099E4A8-4377-4C64-A98C-3DEDDC075502@enterprisedb.com
|
4 years ago |
|
|
9bbf6f7341 |
Prevent regexp back-refs from sometimes matching when they shouldn't.
The recursion in cdissect() was careless about clearing match data for capturing parentheses after rejecting a partial match. This could allow a later back-reference to succeed when by rights it should fail for lack of a defined referent. To fix, think a little more rigorously about what the contract between different levels of cdissect's recursion needs to be. With the right spec, we can fix this using fewer rather than more resets of the match data; the key decision being that a failed sub-match is now explicitly responsible for clearing any matches it may have set. There are enough other cross-checks and optimizations in the code that it's not especially easy to exhibit this problem; usually, the match will fail as-expected. Plus, regexps that are even potentially vulnerable are most likely user errors, since there's just not much point in writing a back-ref that doesn't always have a referent. These facts perhaps explain why the issue hasn't been detected, even though it's almost certainly a couple of decades old. Discussion: https://postgr.es/m/151435.1629733387@sss.pgh.pa.us |
4 years ago |
|
|
facce1da91 |
Fix performance bug in regexp's citerdissect/creviterdissect.
After detecting a sub-match "dissect" failure (i.e., a backref match
failure) in the i'th sub-match of an iteration node, we should proceed
by adjusting the attempted length of the i'th submatch. As coded,
though, these functions changed the attempted length of the *last*
sub-match, and only after exhausting all possibilities for that would
they back up to adjust the next-to-last sub-match, and then the
second-from-last, etc; all of which is wasted effort, since only
changing the start or length of the i'th sub-match can possibly make
it succeed. This oversight creates the possibility for exponentially
bad performance. Fortunately the problem is masked in most cases by
optimizations or constraints applied elsewhere; which explains why
we'd not noticed it before. But it is possible to reach the problem
with fairly simple, if contrived, regexps.
Oversight in my commit
|
4 years ago |
|
|
78a843f119 |
Improve regex compiler's arc moving/copying logic.
The functions moveins(), copyins(), moveouts(), copyouts() are
required to preserve the invariant that there are no duplicate arcs in
the regex's NFA. Spencer's original implementation of them was O(N^2)
since it checked separately for a match to each source arc. In commit
|
4 years ago |
|
|
0e6aa8747d |
Avoid determining regexp subexpression matches, when possible.
Identifying the precise match locations for parenthesized subexpressions is a fairly expensive task given the way our regexp engine works, both at regexp compile time (where we must create an optimized NFA for each parenthesized subexpression) and at runtime (where determining exact match locations requires laborious search). Up to now we've made little attempt to optimize this situation. This patch identifies cases where we know at compile time that we won't need to know subexpression match locations, and teaches the regexp compiler to not bother creating per-subexpression regexps for parenthesis pairs that are not referenced by backrefs elsewhere in the regexp. (To preserve semantics, we obviously still have to pin down the match locations of backref references.) Users could have obtained the same results before this by being careful to write "non capturing" parentheses wherever possible, but few people bother with that. Discussion: https://postgr.es/m/2219936.1628115334@sss.pgh.pa.us |
4 years ago |
|
|
00116dee5a |
Rethink regexp engine's backref-related compilation state.
I had committer's remorse almost immediately after pushing |
4 years ago |
|
|
cb76fbd7ec |
Make regexp engine's backref-related compilation state more bulletproof.
Up to now, we remembered the definition of a capturing parenthesis
subexpression by storing a pointer to the associated subRE node.
That was okay before, because that subRE didn't get modified anymore
while parsing the rest of the regexp. However, in the wake of
commit
|
4 years ago |
|
|
cc1868799c |
Fix use-after-free issue in regexp engine.
Commit |
4 years ago |
|
|
f68970e33f |
Fix performance issue in new regex match-all detection code.
Commit
|
5 years ago |
|
|
6c20bdb2a2 |
Further tweak memory management for regex DFAs.
Coverity is still unhappy after commit
|
5 years ago |
|
|
4604f83fdf |
Suppress unnecessary regex subre nodes in a couple more cases.
This extends the changes made in commit
|
5 years ago |
|
|
0c3405cf11 |
Improve performance of regular expression back-references.
In some cases, at the time that we're doing an NFA-based precheck
of whether a backref subexpression can match at a particular place
in the string, we already know which substring the referenced
subexpression matched. If so, we might as well forget about the NFA
and just compare the substring; this is faster and it gives an exact
rather than approximate answer.
In general, this optimization can help while we are prechecking within
the second child expression of a concat node, while the capture was
within the first child expression; then the substring was saved during
cdissect() of the first child and will be available to NFA checks done
while cdissect() recurses into the second child. It can help quite a
lot if the tree looks like
concat
/ \
capture concat
/ \
expensive stuff backref
as we will be able to avoid recursively dissecting the "expensive
stuff" before discovering that the backref isn't satisfied with a
particular midpoint that the lower concat node is testing. This
doesn't help if the concat tree is left-deep, as the capture node
won't get set soon enough (and it's hard to fix that without changing
the engine's match behavior). Fortunately, right-deep concat trees
are the common case.
Patch by me, reviewed by Joel Jacobson
Discussion: https://postgr.es/m/661609.1614560029@sss.pgh.pa.us
|
5 years ago |
|
|
4aea704a5b |
Fix semantics of regular expression back-references.
POSIX defines the behavior of back-references thus:
The back-reference expression '\n' shall match the same (possibly
empty) string of characters as was matched by a subexpression
enclosed between "\(" and "\)" preceding the '\n'.
As far as I can see, the back-reference is supposed to consider only
the data characters matched by the referenced subexpression. However,
because our engine copies the NFA constructed from the referenced
subexpression, it effectively enforces any constraints therein, too.
As an example, '(^.)\1' ought to match 'xx', or any other string
starting with two occurrences of the same character; but in our code
it does not, and indeed can't match anything, because the '^' anchor
constraint is included in the backref's copied NFA. If POSIX intended
that, you'd think they'd mention it. Perl for one doesn't act that
way, so it's hard to conclude that this isn't a bug.
Fix by modifying the backref's NFA immediately after it's copied from
the reference, replacing all constraint arcs by EMPTY arcs so that the
constraints are treated as automatically satisfied. This still allows
us to enforce matching rules that depend only on the data characters;
for example, in '(^\d+).*\1' the NFA matching step will still know
that the backref can only match strings of digits.
Perhaps surprisingly, this change does not affect the results of any
of a rather large corpus of real-world regexes. Nonetheless, I would
not consider back-patching it, since it's a clear compatibility break.
Patch by me, reviewed by Joel Jacobson
Discussion: https://postgr.es/m/661609.1614560029@sss.pgh.pa.us
|
5 years ago |
|
|
0fc1af174c |
Improve memory management in regex compiler.
The previous logic here created a separate pool of arcs for each state, so that the out-arcs of each state were physically stored within it. Perhaps this choice was driven by trying to not include a "from" pointer within each arc; but Spencer gave up on that idea long ago, and it's hard to see what the value is now. The approach turns out to be fairly disastrous in terms of memory consumption, though. In the first place, NFAs built by this engine seem to have about 4 arcs per state on average, with a majority having only one or two out-arcs. So pre-allocating 10 out-arcs for each state is already cause for a factor of two or more bloat. Worse, the NFA optimization phase moves arcs around with abandon. In a large NFA, some of the states will have hundreds of out-arcs, so towards the end of the optimization phase we have a significant number of states whose arc pools have room for hundreds of arcs each, even though only a few of those arcs are in use. We have seen real-world regexes in which this effect bloats the memory requirement by 25X or even more. Hence, get rid of the per-state arc pools in favor of a single arc pool for the whole NFA, with variable-sized allocation batches instead of always asking for 10 at a time. While we're at it, let's batch the allocations of state structs too, to further reduce the malloc traffic. This incidentally allows moveouts() to be optimized in a similar way to moveins(): when moving an arc to another state, it's now valid to just re-link the same arc struct into a different outchain, where before the code invariants required us to make a physically new arc and then free the old one. These changes reduce the regex compiler's typical space consumption for average-size regexes by about a factor of two, and much more for large or complicated regexes. In a large test set of real-world regexes, we formerly had half a dozen cases that failed with "regular expression too complex" due to exceeding the REG_MAX_COMPILE_SPACE limit (about 150MB); we would have had to raise that limit to something close to 400MB to make them work with the old code. Now, none of those cases need more than 13MB to compile. Furthermore, the test set is about 10% faster overall due to less malloc traffic. Discussion: https://postgr.es/m/168861.1614298592@sss.pgh.pa.us |
5 years ago |
|
|
301ed8812e |
Doc: remove src/backend/regex/re_syntax.n.
We aren't publishing this file as documentation, and it's been
much more haphazardly maintained than the real docs in func.sgml,
so let's just drop it. I think the only reason I included it in
commit
|
5 years ago |
|
|
7dc13a0f08 |
Change regex \D and \W shorthands to always match newlines.
Newline is certainly not a digit, nor a word character, so it is
sensible that it should match these complemented character classes.
Previously, \D and \W acted that way by default, but in
newline-sensitive mode ('n' or 'p' flag) they did not match newlines.
This behavior was previously forced because explicit complemented
character classes don't match newlines in newline-sensitive mode;
but as of the previous commit that implementation constraint no
longer exists. It seems useful to change this because the primary
real-world use for newline-sensitive mode seems to be to match the
default behavior of other regex engines such as Perl and Javascript
... and their default behavior is that these match newlines.
The old behavior can be kept by writing an explicit complemented
character class, i.e. [^[:digit:]] or [^[:word:]]. (This means
that \D and \W are not exactly equivalent to those strings, but
they weren't anyway.)
Discussion: https://postgr.es/m/3220564.1613859619@sss.pgh.pa.us
|
5 years ago |
|
|
2a0af7fe46 |
Allow complemented character class escapes within regex brackets.
The complement-class escapes \D, \S, \W are now allowed within bracket expressions. There is no semantic difficulty with doing that, but the rather hokey macro-expansion-based implementation previously used here couldn't cope. Also, invent "word" as an allowed character class name, thus "\w" is now equivalent to "[[:word:]]" outside brackets, or "[:word:]" within brackets. POSIX allows such implementation-specific extensions, and the same name is used in e.g. bash. One surprising compatibility issue this raises is that constructs such as "[\w-_]" are now disallowed, as our documentation has always said they should be: character classes can't be endpoints of a range. Previously, because \w was just a macro for "[:alnum:]_", such a construct was read as "[[:alnum:]_-_]", so it was accepted so long as the character after "-" was numerically greater than or equal to "_". Some implementation cleanup along the way: * Remove the lexnest() hack, and in consequence clean up wordchrs() to not interact with the lexer. * Fix colorcomplement() to not be O(N^2) in the number of colors involved. * Get rid of useless-as-far-as-I-can-see calls of element() on single-character character element names in brackpart(). element() always maps these to the character itself, and things would be quite broken if it didn't --- should "[a]" match something different than "a" does? Besides, the shortcut path in brackpart() wasn't doing this anyway, making it even more inconsistent. Discussion: https://postgr.es/m/2845172.1613674385@sss.pgh.pa.us Discussion: https://postgr.es/m/3220564.1613859619@sss.pgh.pa.us |
5 years ago |
|
|
3db05e76f9 |
Suppress compiler warning in new regex match-all detection code.
gcc 10 is smart enough to notice that control could reach this "hasmatch[depth]" assignment with depth < 0, but not smart enough to know that that would require a badly broken NFA graph. Change the assert() to a plain runtime test to shut it up. Per report from Andres Freund. Discussion: https://postgr.es/m/20210223173437.b3ywijygsy6q42gq@alap3.anarazel.de |
5 years ago |
|
|
190c79884a |
Simplify memory management for regex DFAs a little.
Coverity complained that functions in regexec.c might leak DFA storage. It's wrong, but this logic is confusing enough that it's not so surprising Coverity couldn't make sense of it. Rewrite in hopes of making it more legible to humans as well as machines. |
5 years ago |
|
|
ea1268f630 |
Avoid generating extra subre tree nodes for capturing parentheses.
Previously, each pair of capturing parentheses gave rise to a separate subre tree node, whose only function was to identify that we ought to capture the match details for this particular sub-expression. In most cases we don't really need that, since we can perfectly well put a "capture this" annotation on the child node that does the real matching work. As with the two preceding commits, the main value of this is to avoid generating and optimizing an NFA for a tree node that's not really pulling its weight. The chosen data representation only allows one capture annotation per subre node. In the legal-per-spec, but seemingly not very useful, case where there are multiple capturing parens around the exact same bit of the regex (i.e. "((xyz))"), wrap the child node in N-1 capture nodes that act the same as before. We could work harder at that but I'll refrain, pending some evidence that such cases are worth troubling over. In passing, improve the comments in regex.h to say what all the different re_info bits mean. Some of them were pretty obvious but others not so much, so reverse-engineer some documentation. This is part of a patch series that in total reduces the regex engine's runtime by about a factor of four on a large corpus of real-world regexes. Patch by me, reviewed by Joel Jacobson Discussion: https://postgr.es/m/1340281.1613018383@sss.pgh.pa.us |
5 years ago |
|
|
5810430894 |
Convert regex engine's subre tree from binary to N-ary style.
Instead of having left and right child links in subre structs, have a single child link plus a sibling link. Multiple children of a tree node are now reached by chasing the sibling chain. The beneficiary of this is alternation tree nodes. A regular expression with N (>1) branches is now represented by one alternation node with N children, rather than a tree that includes N alternation nodes as well as N children. While the old representation didn't really cost anything extra at execution time, it was pretty horrid for compilation purposes, because each of the alternation nodes had its own NFA, which we were too stupid not to separately optimize. (To make matters worse, all of those NFAs described the entire alternation pattern, not just the portion of it that one might expect from the tree structure.) We continue to require concatenation nodes to have exactly two children. This data structure is now prepared to support more, but the executor's logic would need some careful redesign, and it's not clear that a lot of benefit could be had. This is part of a patch series that in total reduces the regex engine's runtime by about a factor of four on a large corpus of real-world regexes. Patch by me, reviewed by Joel Jacobson Discussion: https://postgr.es/m/1340281.1613018383@sss.pgh.pa.us |
5 years ago |
|
|
cebc1d34e5 |
Fix regex engine to suppress useless concatenation sub-REs.
The comment for parsebranch() claims that it avoids generating unnecessary concatenation nodes in the "subre" tree, but it missed some significant cases. Once we've decided that a given atom is "messy" and can't be bundled with the preceding atom(s) of the current regex branch, parseqatom() always generated two new concat nodes, one to concat the messy atom to what follows it in the branch, and an upper node to concatenate the preceding part of the branch to that one. But one or both of these could be unnecessary, if the messy atom is the first, last, or only one in the branch. Improve the code to suppress such useless concat nodes, along with the no-op child nodes representing empty chunks of a branch. Reducing the number of subre tree nodes offers significant savings not only at execution but during compilation, because each subre node has its own NFA that has to be separately optimized. (Maybe someday we'll figure out how to share the optimization work across multiple tree nodes, but it doesn't look easy.) Eliminating upper tree nodes is especially useful because they tend to have larger NFAs. This is part of a patch series that in total reduces the regex engine's runtime by about a factor of four on a large corpus of real-world regexes. Patch by me, reviewed by Joel Jacobson Discussion: https://postgr.es/m/1340281.1613018383@sss.pgh.pa.us |
5 years ago |
|
|
824bf71902 |
Recognize "match-all" NFAs within the regex engine.
This builds on the previous "rainbow" patch to detect NFAs that will
match any string, though possibly with constraints on the string length.
This definition is chosen to match constructs such as ".*", ".+", and
".{1,100}". Recognizing such an NFA after the optimization pass is
fairly cheap, since we basically just have to verify that all arcs
are RAINBOW arcs and count the number of steps to the end state.
(Well, there's a bit of complication with pseudo-color arcs for string
boundary conditions, but not much.)
Once we have these markings, the regex executor functions longest(),
shortest(), and matchuntil() don't have to expend per-character work
to determine whether a given substring satisfies such an NFA; they
just need to check its length against the bounds. Since some matching
problems require O(N) invocations of these functions, we've reduced
the runtime for an N-character string from O(N^2) to O(N). Of course,
this is no help for non-matchall sub-patterns, but those usually have
constraints that allow us to avoid needing O(N) substring checks in the
first place. It's precisely the unconstrained "match-all" cases that
cause the most headaches.
This is part of a patch series that in total reduces the regex engine's
runtime by about a factor of four on a large corpus of real-world regexes.
Patch by me, reviewed by Joel Jacobson
Discussion: https://postgr.es/m/1340281.1613018383@sss.pgh.pa.us
|
5 years ago |
|
|
08c0d6ad65 |
Invent "rainbow" arcs within the regex engine.
Some regular expression constructs, most notably the "." match-anything metacharacter, produce a sheaf of parallel NFA arcs covering all possible colors (that is, character equivalence classes). We can make a noticeable improvement in the space and time needed to process large regexes by replacing such cases with a single arc bearing the special color code "RAINBOW". This requires only minor additional complication in places such as pull() and push(). Callers of pg_reg_getoutarcs() must now be prepared for the possibility of seeing a RAINBOW arc. For the one known user, contrib/pg_trgm, that's a net benefit since it cuts the number of arcs to be dealt with, and the handling isn't any different than for other colors that contain too many characters to be dealt with individually. This is part of a patch series that in total reduces the regex engine's runtime by about a factor of four on a large corpus of real-world regexes. Patch by me, reviewed by Joel Jacobson Discussion: https://postgr.es/m/1340281.1613018383@sss.pgh.pa.us |
5 years ago |
|
|
b5a66e7353 |
Fix another ancient bug in parsing of BRE-mode regular expressions.
While poking at the regex code, I happened to notice that the bug
squashed in commit
|
5 years ago |
|
|
4e703d6719 |
Make some minor improvements in the regex code.
Push some hopefully-uncontroversial bits extracted from an upcoming patch series, to remove non-relevant clutter from the main patches. In compact(), return immediately after setting REG_ASSERT error; continuing the loop would just lead to assertion failure below. (Ask me how I know.) In parseqatom(), remove assertion that moresubs() did its job. When moresubs actually did its job, this is redundant with that function's final assert; but when it failed on OOM, this is an assertion crash. We could avoid the crash by adding a NOERR() check before the assertion, but it seems better to subtract code than add it. (Note that there's a NOERR exit a few lines further down, and nothing else between here and there requires moresubs to have succeeded. So we don't really need an extra error exit.) This is a live bug in assert-enabled builds, but given the very low likelihood of OOM in moresub's tiny allocation, I don't think it's worth back-patching. On the other hand, it seems worthwhile to add an assertion that our intended v->subs[subno] target is still null by the time we are ready to insert into it, since there's a recursion in between. In pg_regexec, ensure we fflush any debug output on the way out, and try to make MDEBUG messages more uniform and helpful. (In particular, ensure that all of them are prefixed with the subre's id number, so one can match up entry and exit reports.) Add some test cases in test_regex to improve coverage of lookahead and lookbehind constraints. Adding these now is mainly to establish that this is indeed the existing behavior. Discussion: https://postgr.es/m/1340281.1613018383@sss.pgh.pa.us |
5 years ago |
|
|
2dd6733108 |
Minor fixes to improve regex debugging code.
When REG_DEBUG is defined, ensure that an un-filled "struct cnfa" is all-zeroes, not just that it has nstates == 0. This is mainly so that looking at "struct subre" structs in gdb doesn't distract one with a lot of garbage fields during regex compilation. Adjust some places that print debug output to have suitable fflush calls afterwards. In passing, correct an erroneous ancient comment: the concatenation subre-s created by parsebranch() have op == '.' not ','. Noted while fooling around with some regex performance improvements. |
5 years ago |
|
|
afcc8772ed |
Fix ancient bug in parsing of BRE-mode regular expressions.
brenext(), when parsing a '*' quantifier, forgot to return any "value" for the token; per the equivalent case in next(), it should return value 1 to indicate that greedy rather than non-greedy behavior is wanted. The result is that the compiled regexp could behave like 'x*?' rather than the intended 'x*', if we were unlucky enough to have a zero in v->nextvalue at this point. That seems to happen with some reliability if we have '.*' at the beginning of a BRE-mode regexp, although that depends on the initial contents of a stack-allocated struct, so it's not guaranteed to fail. Found by Alexander Lakhin using valgrind testing. This bug seems to be aboriginal in Spencer's code, so back-patch all the way. Discussion: https://postgr.es/m/16814-6c5e3edd2bdf0d50@postgresql.org |
5 years ago |
|
|
ca3b37487b |
Update copyright for 2021
Backpatch-through: 9.5 |
5 years ago |
|
|
17cc133f01
|
Dial back -Wimplicit-fallthrough to level 3
The additional pain from level 4 is excessive for the gain. Also revert all the source annotation changes to their original wordings, to avoid back-patching pain. Discussion: https://postgr.es/m/31166.1589378554@sss.pgh.pa.us |
6 years ago |
|
|
3e9744465d
|
Add -Wimplicit-fallthrough to CFLAGS and CXXFLAGS
Use it at level 4, a bit more restrictive than the default level, and tweak our commanding comments to FALLTHROUGH. (However, leave zic.c alone, since it's external code; to avoid the warnings that would appear there, change CFLAGS for that file in the Makefile.) Author: Julien Rouhaud <rjuju123@gmail.com> Author: Álvaro Herrera <alvherre@alvh.no-ip.org> Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> Discussion: https://postgr.es/m/20200412081825.qyo5vwwco3fv4gdo@nol Discussion: https://postgr.es/m/flat/E1fDenm-0000C8-IJ@gemulon.postgresql.org |
6 years ago |
|
|
7559d8ebfa |
Update copyrights for 2020
Backpatch-through: update all files in master, backpatch legal files through 9.4 |
6 years ago |