From 5fa0ca56bf82086755f97c2586eb6b615b96d946 Mon Sep 17 00:00:00 2001 From: Jon Stovell Date: Wed, 24 Jul 2024 18:47:04 -0600 Subject: [PATCH] Improves fallback code in Utf8String::extractWords() Signed-off-by: Jon Stovell --- Sources/Tasks/UpdateUnicode.php | 60 +- Sources/Unicode/QuickCheck.php | 2 +- Sources/Unicode/RegularExpressions.php | 1379 +++++++++++++++++++++++- Sources/Unicode/Utf8String.php | 297 ++++- 4 files changed, 1678 insertions(+), 60 deletions(-) diff --git a/Sources/Tasks/UpdateUnicode.php b/Sources/Tasks/UpdateUnicode.php index 013fc9b829..fbbf072e5f 100644 --- a/Sources/Tasks/UpdateUnicode.php +++ b/Sources/Tasks/UpdateUnicode.php @@ -246,27 +246,40 @@ class UpdateUnicode extends BackgroundTask 'PropList.txt', 'emoji/emoji-data.txt', 'extracted/DerivedGeneralCategory.txt', + 'auxiliary/WordBreakProperty.txt', ], 'props' => [ + 'ALetter', 'Bidi_Control', 'Case_Ignorable', 'Cn', 'Default_Ignorable_Code_Point', 'Emoji', 'Emoji_Modifier', + 'Extend', + 'ExtendNumLet', + 'Format', + 'Hebrew_Letter', 'Ideographic', 'Join_Control', + 'Katakana', + 'MidLetter', + 'MidNum', + 'MidNumLet', + 'Numeric', 'Regional_Indicator', 'Variation_Selector', + 'WSegSpace', ], 'desc' => [ 'Helper function for utf8_sanitize_invisibles and utf8_convert_case.', '', 'Character class lists compiled from:', - 'https://unicode.org/Public/UNIDATA/DerivedCoreProperties.txt', - 'https://unicode.org/Public/UNIDATA/PropList.txt', - 'https://unicode.org/Public/UNIDATA/emoji/emoji-data.txt', - 'https://unicode.org/Public/UNIDATA/extracted/DerivedGeneralCategory.txt', + self::DATA_URL_UCD . '/DerivedCoreProperties.txt', + self::DATA_URL_UCD . '/PropList.txt', + self::DATA_URL_UCD . '/emoji/emoji-data.txt', + self::DATA_URL_UCD . '/extracted/DerivedGeneralCategory.txt', + self::DATA_URL_UCD . '/auxiliary/WordBreakProperty.txt', ], 'return' => [ 'type' => 'array', @@ -282,8 +295,8 @@ class UpdateUnicode extends BackgroundTask 'Helper function for utf8_sanitize_invisibles.', '', 'Character class lists compiled from:', - 'https://unicode.org/Public/UNIDATA/StandardizedVariants.txt', - 'https://unicode.org/Public/UNIDATA/emoji/emoji-variation-sequences.txt', + self::DATA_URL_UCD . '/StandardizedVariants.txt', + self::DATA_URL_UCD . '/emoji/emoji-variation-sequences.txt', ], 'return' => [ 'type' => 'array', @@ -299,7 +312,7 @@ class UpdateUnicode extends BackgroundTask 'Helper function for utf8_sanitize_invisibles.', '', 'Character class lists compiled from:', - 'https://unicode.org/Public/UNIDATA/extracted/DerivedJoiningType.txt', + self::DATA_URL_UCD . '/extracted/DerivedJoiningType.txt', ], 'return' => [ 'type' => 'array', @@ -315,8 +328,8 @@ class UpdateUnicode extends BackgroundTask 'Helper function for utf8_sanitize_invisibles.', '', 'Character class lists compiled from:', - 'https://unicode.org/Public/UNIDATA/extracted/DerivedCombiningClass.txt', - 'https://unicode.org/Public/UNIDATA/IndicSyllabicCategory.txt', + self::DATA_URL_UCD . '/extracted/DerivedCombiningClass.txt', + self::DATA_URL_UCD . '/IndicSyllabicCategory.txt', ], 'return' => [ 'type' => 'array', @@ -332,7 +345,7 @@ class UpdateUnicode extends BackgroundTask 'Helper function for utf8_is_normalized.', '', 'Character class lists compiled from:', - 'https://unicode.org/Public/UNIDATA/extracted/DerivedNormalizationProps.txt', + self::DATA_URL_UCD . '/extracted/DerivedNormalizationProps.txt', ], 'return' => [ 'type' => 'array', @@ -517,6 +530,7 @@ class UpdateUnicode extends BackgroundTask 'emoji/emoji-variation-sequences.txt', 'extracted/DerivedGeneralCategory.txt', 'extracted/DerivedJoiningType.txt', + 'auxiliary/WordBreakProperty.txt', ], self::DATA_URL_IDNA => [ 'IdnaMappingTable.txt', @@ -1479,25 +1493,25 @@ private function build_regex_properties(): bool } $this->funcs['utf8_regex_properties']['data'][$fields[1]][] = '\\x{' . str_replace('..', '}-\\x{', $fields[0]) . '}'; - } - // We also track 'Default_Ignorable_Code_Point' property in a separate array. - if ($fields[1] !== 'Default_Ignorable_Code_Point') { - continue; + $this->funcs['utf8_regex_properties']['data'][$fields[1]] = array_unique($this->funcs['utf8_regex_properties']['data'][$fields[1]]); } - if (!str_contains($fields[0], '..')) { - $this->funcs['utf8_default_ignorables']['data'][] = '&#x' . $fields[0] . ';'; - } else { - list($start, $end) = explode('..', $fields[0]); + // We also track 'Default_Ignorable_Code_Point' property in a separate array. + if ($fields[1] === 'Default_Ignorable_Code_Point') { + if (!str_contains($fields[0], '..')) { + $this->funcs['utf8_default_ignorables']['data'][] = '&#x' . $fields[0] . ';'; + } else { + list($start, $end) = explode('..', $fields[0]); - $ord_s = hexdec($start); - $ord_e = hexdec($end); + $ord_s = hexdec($start); + $ord_e = hexdec($end); - $ord = $ord_s; + $ord = $ord_s; - while ($ord <= $ord_e) { - $this->funcs['utf8_default_ignorables']['data'][] = '&#x' . strtoupper(sprintf('%04s', dechex($ord++))) . ';'; + while ($ord <= $ord_e) { + $this->funcs['utf8_default_ignorables']['data'][] = '&#x' . strtoupper(sprintf('%04s', dechex($ord++))) . ';'; + } } } } diff --git a/Sources/Unicode/QuickCheck.php b/Sources/Unicode/QuickCheck.php index fa5af04551..09dfbc5804 100644 --- a/Sources/Unicode/QuickCheck.php +++ b/Sources/Unicode/QuickCheck.php @@ -23,7 +23,7 @@ * Helper function for utf8_is_normalized. * * Character class lists compiled from: - * https://unicode.org/Public/UNIDATA/extracted/DerivedNormalizationProps.txt + * https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedNormalizationProps.txt * * Developers: Do not update the data in this function manually. Instead, * run "php -f other/update_unicode_data.php" on the command line. diff --git a/Sources/Unicode/RegularExpressions.php b/Sources/Unicode/RegularExpressions.php index 8537aed9e0..7018bc99b6 100644 --- a/Sources/Unicode/RegularExpressions.php +++ b/Sources/Unicode/RegularExpressions.php @@ -23,10 +23,11 @@ * Helper function for utf8_sanitize_invisibles and utf8_convert_case. * * Character class lists compiled from: - * https://unicode.org/Public/UNIDATA/DerivedCoreProperties.txt - * https://unicode.org/Public/UNIDATA/PropList.txt - * https://unicode.org/Public/UNIDATA/emoji/emoji-data.txt - * https://unicode.org/Public/UNIDATA/extracted/DerivedGeneralCategory.txt + * https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt + * https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt + * https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt + * https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedGeneralCategory.txt + * https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/WordBreakProperty.txt * * Developers: Do not update the data in this function manually. Instead, * run "php -f other/update_unicode_data.php" on the command line. @@ -36,6 +37,663 @@ function utf8_regex_properties(): array { return [ + 'ALetter' => + '\\x{0041}-\\x{005A}' . + '\\x{0061}-\\x{007A}' . + '\\x{00AA}' . + '\\x{00B5}' . + '\\x{00BA}' . + '\\x{00C0}-\\x{00D6}' . + '\\x{00D8}-\\x{00F6}' . + '\\x{00F8}-\\x{01BA}' . + '\\x{01BB}' . + '\\x{01BC}-\\x{01BF}' . + '\\x{01C0}-\\x{01C3}' . + '\\x{01C4}-\\x{0293}' . + '\\x{0294}' . + '\\x{0295}-\\x{02AF}' . + '\\x{02B0}-\\x{02C1}' . + '\\x{02C2}-\\x{02C5}' . + '\\x{02C6}-\\x{02D1}' . + '\\x{02D2}-\\x{02D7}' . + '\\x{02DE}-\\x{02DF}' . + '\\x{02E0}-\\x{02E4}' . + '\\x{02E5}-\\x{02EB}' . + '\\x{02EC}' . + '\\x{02ED}' . + '\\x{02EE}' . + '\\x{02EF}-\\x{02FF}' . + '\\x{0370}-\\x{0373}' . + '\\x{0374}' . + '\\x{0376}-\\x{0377}' . + '\\x{037A}' . + '\\x{037B}-\\x{037D}' . + '\\x{037F}' . + '\\x{0386}' . + '\\x{0388}-\\x{038A}' . + '\\x{038C}' . + '\\x{038E}-\\x{03A1}' . + '\\x{03A3}-\\x{03F5}' . + '\\x{03F7}-\\x{0481}' . + '\\x{048A}-\\x{052F}' . + '\\x{0531}-\\x{0556}' . + '\\x{0559}' . + '\\x{055A}-\\x{055C}' . + '\\x{055E}' . + '\\x{0560}-\\x{0588}' . + '\\x{058A}' . + '\\x{05F3}' . + '\\x{0620}-\\x{063F}' . + '\\x{0640}' . + '\\x{0641}-\\x{064A}' . + '\\x{066E}-\\x{066F}' . + '\\x{0671}-\\x{06D3}' . + '\\x{06D5}' . + '\\x{06E5}-\\x{06E6}' . + '\\x{06EE}-\\x{06EF}' . + '\\x{06FA}-\\x{06FC}' . + '\\x{06FF}' . + '\\x{070F}' . + '\\x{0710}' . + '\\x{0712}-\\x{072F}' . + '\\x{074D}-\\x{07A5}' . + '\\x{07B1}' . + '\\x{07CA}-\\x{07EA}' . + '\\x{07F4}-\\x{07F5}' . + '\\x{07FA}' . + '\\x{0800}-\\x{0815}' . + '\\x{081A}' . + '\\x{0824}' . + '\\x{0828}' . + '\\x{0840}-\\x{0858}' . + '\\x{0860}-\\x{086A}' . + '\\x{0870}-\\x{0887}' . + '\\x{0889}-\\x{088E}' . + '\\x{08A0}-\\x{08C8}' . + '\\x{08C9}' . + '\\x{0904}-\\x{0939}' . + '\\x{093D}' . + '\\x{0950}' . + '\\x{0958}-\\x{0961}' . + '\\x{0971}' . + '\\x{0972}-\\x{0980}' . + '\\x{0985}-\\x{098C}' . + '\\x{098F}-\\x{0990}' . + '\\x{0993}-\\x{09A8}' . + '\\x{09AA}-\\x{09B0}' . + '\\x{09B2}' . + '\\x{09B6}-\\x{09B9}' . + '\\x{09BD}' . + '\\x{09CE}' . + '\\x{09DC}-\\x{09DD}' . + '\\x{09DF}-\\x{09E1}' . + '\\x{09F0}-\\x{09F1}' . + '\\x{09FC}' . + '\\x{0A05}-\\x{0A0A}' . + '\\x{0A0F}-\\x{0A10}' . + '\\x{0A13}-\\x{0A28}' . + '\\x{0A2A}-\\x{0A30}' . + '\\x{0A32}-\\x{0A33}' . + '\\x{0A35}-\\x{0A36}' . + '\\x{0A38}-\\x{0A39}' . + '\\x{0A59}-\\x{0A5C}' . + '\\x{0A5E}' . + '\\x{0A72}-\\x{0A74}' . + '\\x{0A85}-\\x{0A8D}' . + '\\x{0A8F}-\\x{0A91}' . + '\\x{0A93}-\\x{0AA8}' . + '\\x{0AAA}-\\x{0AB0}' . + '\\x{0AB2}-\\x{0AB3}' . + '\\x{0AB5}-\\x{0AB9}' . + '\\x{0ABD}' . + '\\x{0AD0}' . + '\\x{0AE0}-\\x{0AE1}' . + '\\x{0AF9}' . + '\\x{0B05}-\\x{0B0C}' . + '\\x{0B0F}-\\x{0B10}' . + '\\x{0B13}-\\x{0B28}' . + '\\x{0B2A}-\\x{0B30}' . + '\\x{0B32}-\\x{0B33}' . + '\\x{0B35}-\\x{0B39}' . + '\\x{0B3D}' . + '\\x{0B5C}-\\x{0B5D}' . + '\\x{0B5F}-\\x{0B61}' . + '\\x{0B71}' . + '\\x{0B83}' . + '\\x{0B85}-\\x{0B8A}' . + '\\x{0B8E}-\\x{0B90}' . + '\\x{0B92}-\\x{0B95}' . + '\\x{0B99}-\\x{0B9A}' . + '\\x{0B9C}' . + '\\x{0B9E}-\\x{0B9F}' . + '\\x{0BA3}-\\x{0BA4}' . + '\\x{0BA8}-\\x{0BAA}' . + '\\x{0BAE}-\\x{0BB9}' . + '\\x{0BD0}' . + '\\x{0C05}-\\x{0C0C}' . + '\\x{0C0E}-\\x{0C10}' . + '\\x{0C12}-\\x{0C28}' . + '\\x{0C2A}-\\x{0C39}' . + '\\x{0C3D}' . + '\\x{0C58}-\\x{0C5A}' . + '\\x{0C5D}' . + '\\x{0C60}-\\x{0C61}' . + '\\x{0C80}' . + '\\x{0C85}-\\x{0C8C}' . + '\\x{0C8E}-\\x{0C90}' . + '\\x{0C92}-\\x{0CA8}' . + '\\x{0CAA}-\\x{0CB3}' . + '\\x{0CB5}-\\x{0CB9}' . + '\\x{0CBD}' . + '\\x{0CDD}-\\x{0CDE}' . + '\\x{0CE0}-\\x{0CE1}' . + '\\x{0CF1}-\\x{0CF2}' . + '\\x{0D04}-\\x{0D0C}' . + '\\x{0D0E}-\\x{0D10}' . + '\\x{0D12}-\\x{0D3A}' . + '\\x{0D3D}' . + '\\x{0D4E}' . + '\\x{0D54}-\\x{0D56}' . + '\\x{0D5F}-\\x{0D61}' . + '\\x{0D7A}-\\x{0D7F}' . + '\\x{0D85}-\\x{0D96}' . + '\\x{0D9A}-\\x{0DB1}' . + '\\x{0DB3}-\\x{0DBB}' . + '\\x{0DBD}' . + '\\x{0DC0}-\\x{0DC6}' . + '\\x{0F00}' . + '\\x{0F40}-\\x{0F47}' . + '\\x{0F49}-\\x{0F6C}' . + '\\x{0F88}-\\x{0F8C}' . + '\\x{10A0}-\\x{10C5}' . + '\\x{10C7}' . + '\\x{10CD}' . + '\\x{10D0}-\\x{10FA}' . + '\\x{10FC}' . + '\\x{10FD}-\\x{10FF}' . + '\\x{1100}-\\x{1248}' . + '\\x{124A}-\\x{124D}' . + '\\x{1250}-\\x{1256}' . + '\\x{1258}' . + '\\x{125A}-\\x{125D}' . + '\\x{1260}-\\x{1288}' . + '\\x{128A}-\\x{128D}' . + '\\x{1290}-\\x{12B0}' . + '\\x{12B2}-\\x{12B5}' . + '\\x{12B8}-\\x{12BE}' . + '\\x{12C0}' . + '\\x{12C2}-\\x{12C5}' . + '\\x{12C8}-\\x{12D6}' . + '\\x{12D8}-\\x{1310}' . + '\\x{1312}-\\x{1315}' . + '\\x{1318}-\\x{135A}' . + '\\x{1380}-\\x{138F}' . + '\\x{13A0}-\\x{13F5}' . + '\\x{13F8}-\\x{13FD}' . + '\\x{1401}-\\x{166C}' . + '\\x{166F}-\\x{167F}' . + '\\x{1681}-\\x{169A}' . + '\\x{16A0}-\\x{16EA}' . + '\\x{16EE}-\\x{16F0}' . + '\\x{16F1}-\\x{16F8}' . + '\\x{1700}-\\x{1711}' . + '\\x{171F}-\\x{1731}' . + '\\x{1740}-\\x{1751}' . + '\\x{1760}-\\x{176C}' . + '\\x{176E}-\\x{1770}' . + '\\x{1820}-\\x{1842}' . + '\\x{1843}' . + '\\x{1844}-\\x{1878}' . + '\\x{1880}-\\x{1884}' . + '\\x{1887}-\\x{18A8}' . + '\\x{18AA}' . + '\\x{18B0}-\\x{18F5}' . + '\\x{1900}-\\x{191E}' . + '\\x{1A00}-\\x{1A16}' . + '\\x{1B05}-\\x{1B33}' . + '\\x{1B45}-\\x{1B4C}' . + '\\x{1B83}-\\x{1BA0}' . + '\\x{1BAE}-\\x{1BAF}' . + '\\x{1BBA}-\\x{1BE5}' . + '\\x{1C00}-\\x{1C23}' . + '\\x{1C4D}-\\x{1C4F}' . + '\\x{1C5A}-\\x{1C77}' . + '\\x{1C78}-\\x{1C7D}' . + '\\x{1C80}-\\x{1C88}' . + '\\x{1C90}-\\x{1CBA}' . + '\\x{1CBD}-\\x{1CBF}' . + '\\x{1CE9}-\\x{1CEC}' . + '\\x{1CEE}-\\x{1CF3}' . + '\\x{1CF5}-\\x{1CF6}' . + '\\x{1CFA}' . + '\\x{1D00}-\\x{1D2B}' . + '\\x{1D2C}-\\x{1D6A}' . + '\\x{1D6B}-\\x{1D77}' . + '\\x{1D78}' . + '\\x{1D79}-\\x{1D9A}' . + '\\x{1D9B}-\\x{1DBF}' . + '\\x{1E00}-\\x{1F15}' . + '\\x{1F18}-\\x{1F1D}' . + '\\x{1F20}-\\x{1F45}' . + '\\x{1F48}-\\x{1F4D}' . + '\\x{1F50}-\\x{1F57}' . + '\\x{1F59}' . + '\\x{1F5B}' . + '\\x{1F5D}' . + '\\x{1F5F}-\\x{1F7D}' . + '\\x{1F80}-\\x{1FB4}' . + '\\x{1FB6}-\\x{1FBC}' . + '\\x{1FBE}' . + '\\x{1FC2}-\\x{1FC4}' . + '\\x{1FC6}-\\x{1FCC}' . + '\\x{1FD0}-\\x{1FD3}' . + '\\x{1FD6}-\\x{1FDB}' . + '\\x{1FE0}-\\x{1FEC}' . + '\\x{1FF2}-\\x{1FF4}' . + '\\x{1FF6}-\\x{1FFC}' . + '\\x{2071}' . + '\\x{207F}' . + '\\x{2090}-\\x{209C}' . + '\\x{2102}' . + '\\x{2107}' . + '\\x{210A}-\\x{2113}' . + '\\x{2115}' . + '\\x{2119}-\\x{211D}' . + '\\x{2124}' . + '\\x{2126}' . + '\\x{2128}' . + '\\x{212A}-\\x{212D}' . + '\\x{212F}-\\x{2134}' . + '\\x{2135}-\\x{2138}' . + '\\x{2139}' . + '\\x{213C}-\\x{213F}' . + '\\x{2145}-\\x{2149}' . + '\\x{214E}' . + '\\x{2160}-\\x{2182}' . + '\\x{2183}-\\x{2184}' . + '\\x{2185}-\\x{2188}' . + '\\x{24B6}-\\x{24E9}' . + '\\x{2C00}-\\x{2C7B}' . + '\\x{2C7C}-\\x{2C7D}' . + '\\x{2C7E}-\\x{2CE4}' . + '\\x{2CEB}-\\x{2CEE}' . + '\\x{2CF2}-\\x{2CF3}' . + '\\x{2D00}-\\x{2D25}' . + '\\x{2D27}' . + '\\x{2D2D}' . + '\\x{2D30}-\\x{2D67}' . + '\\x{2D6F}' . + '\\x{2D80}-\\x{2D96}' . + '\\x{2DA0}-\\x{2DA6}' . + '\\x{2DA8}-\\x{2DAE}' . + '\\x{2DB0}-\\x{2DB6}' . + '\\x{2DB8}-\\x{2DBE}' . + '\\x{2DC0}-\\x{2DC6}' . + '\\x{2DC8}-\\x{2DCE}' . + '\\x{2DD0}-\\x{2DD6}' . + '\\x{2DD8}-\\x{2DDE}' . + '\\x{2E2F}' . + '\\x{3005}' . + '\\x{303B}' . + '\\x{303C}' . + '\\x{3105}-\\x{312F}' . + '\\x{3131}-\\x{318E}' . + '\\x{31A0}-\\x{31BF}' . + '\\x{A000}-\\x{A014}' . + '\\x{A015}' . + '\\x{A016}-\\x{A48C}' . + '\\x{A4D0}-\\x{A4F7}' . + '\\x{A4F8}-\\x{A4FD}' . + '\\x{A500}-\\x{A60B}' . + '\\x{A60C}' . + '\\x{A610}-\\x{A61F}' . + '\\x{A62A}-\\x{A62B}' . + '\\x{A640}-\\x{A66D}' . + '\\x{A66E}' . + '\\x{A67F}' . + '\\x{A680}-\\x{A69B}' . + '\\x{A69C}-\\x{A69D}' . + '\\x{A6A0}-\\x{A6E5}' . + '\\x{A6E6}-\\x{A6EF}' . + '\\x{A708}-\\x{A716}' . + '\\x{A717}-\\x{A71F}' . + '\\x{A720}-\\x{A721}' . + '\\x{A722}-\\x{A76F}' . + '\\x{A770}' . + '\\x{A771}-\\x{A787}' . + '\\x{A788}' . + '\\x{A789}-\\x{A78A}' . + '\\x{A78B}-\\x{A78E}' . + '\\x{A78F}' . + '\\x{A790}-\\x{A7CA}' . + '\\x{A7D0}-\\x{A7D1}' . + '\\x{A7D3}' . + '\\x{A7D5}-\\x{A7D9}' . + '\\x{A7F2}-\\x{A7F4}' . + '\\x{A7F5}-\\x{A7F6}' . + '\\x{A7F7}' . + '\\x{A7F8}-\\x{A7F9}' . + '\\x{A7FA}' . + '\\x{A7FB}-\\x{A801}' . + '\\x{A803}-\\x{A805}' . + '\\x{A807}-\\x{A80A}' . + '\\x{A80C}-\\x{A822}' . + '\\x{A840}-\\x{A873}' . + '\\x{A882}-\\x{A8B3}' . + '\\x{A8F2}-\\x{A8F7}' . + '\\x{A8FB}' . + '\\x{A8FD}-\\x{A8FE}' . + '\\x{A90A}-\\x{A925}' . + '\\x{A930}-\\x{A946}' . + '\\x{A960}-\\x{A97C}' . + '\\x{A984}-\\x{A9B2}' . + '\\x{A9CF}' . + '\\x{AA00}-\\x{AA28}' . + '\\x{AA40}-\\x{AA42}' . + '\\x{AA44}-\\x{AA4B}' . + '\\x{AAE0}-\\x{AAEA}' . + '\\x{AAF2}' . + '\\x{AAF3}-\\x{AAF4}' . + '\\x{AB01}-\\x{AB06}' . + '\\x{AB09}-\\x{AB0E}' . + '\\x{AB11}-\\x{AB16}' . + '\\x{AB20}-\\x{AB26}' . + '\\x{AB28}-\\x{AB2E}' . + '\\x{AB30}-\\x{AB5A}' . + '\\x{AB5B}' . + '\\x{AB5C}-\\x{AB5F}' . + '\\x{AB60}-\\x{AB68}' . + '\\x{AB69}' . + '\\x{AB70}-\\x{ABBF}' . + '\\x{ABC0}-\\x{ABE2}' . + '\\x{AC00}-\\x{D7A3}' . + '\\x{D7B0}-\\x{D7C6}' . + '\\x{D7CB}-\\x{D7FB}' . + '\\x{FB00}-\\x{FB06}' . + '\\x{FB13}-\\x{FB17}' . + '\\x{FB50}-\\x{FBB1}' . + '\\x{FBD3}-\\x{FD3D}' . + '\\x{FD50}-\\x{FD8F}' . + '\\x{FD92}-\\x{FDC7}' . + '\\x{FDF0}-\\x{FDFB}' . + '\\x{FE70}-\\x{FE74}' . + '\\x{FE76}-\\x{FEFC}' . + '\\x{FF21}-\\x{FF3A}' . + '\\x{FF41}-\\x{FF5A}' . + '\\x{FFA0}-\\x{FFBE}' . + '\\x{FFC2}-\\x{FFC7}' . + '\\x{FFCA}-\\x{FFCF}' . + '\\x{FFD2}-\\x{FFD7}' . + '\\x{FFDA}-\\x{FFDC}' . + '\\x{10000}-\\x{1000B}' . + '\\x{1000D}-\\x{10026}' . + '\\x{10028}-\\x{1003A}' . + '\\x{1003C}-\\x{1003D}' . + '\\x{1003F}-\\x{1004D}' . + '\\x{10050}-\\x{1005D}' . + '\\x{10080}-\\x{100FA}' . + '\\x{10140}-\\x{10174}' . + '\\x{10280}-\\x{1029C}' . + '\\x{102A0}-\\x{102D0}' . + '\\x{10300}-\\x{1031F}' . + '\\x{1032D}-\\x{10340}' . + '\\x{10341}' . + '\\x{10342}-\\x{10349}' . + '\\x{1034A}' . + '\\x{10350}-\\x{10375}' . + '\\x{10380}-\\x{1039D}' . + '\\x{103A0}-\\x{103C3}' . + '\\x{103C8}-\\x{103CF}' . + '\\x{103D1}-\\x{103D5}' . + '\\x{10400}-\\x{1044F}' . + '\\x{10450}-\\x{1049D}' . + '\\x{104B0}-\\x{104D3}' . + '\\x{104D8}-\\x{104FB}' . + '\\x{10500}-\\x{10527}' . + '\\x{10530}-\\x{10563}' . + '\\x{10570}-\\x{1057A}' . + '\\x{1057C}-\\x{1058A}' . + '\\x{1058C}-\\x{10592}' . + '\\x{10594}-\\x{10595}' . + '\\x{10597}-\\x{105A1}' . + '\\x{105A3}-\\x{105B1}' . + '\\x{105B3}-\\x{105B9}' . + '\\x{105BB}-\\x{105BC}' . + '\\x{10600}-\\x{10736}' . + '\\x{10740}-\\x{10755}' . + '\\x{10760}-\\x{10767}' . + '\\x{10780}-\\x{10785}' . + '\\x{10787}-\\x{107B0}' . + '\\x{107B2}-\\x{107BA}' . + '\\x{10800}-\\x{10805}' . + '\\x{10808}' . + '\\x{1080A}-\\x{10835}' . + '\\x{10837}-\\x{10838}' . + '\\x{1083C}' . + '\\x{1083F}-\\x{10855}' . + '\\x{10860}-\\x{10876}' . + '\\x{10880}-\\x{1089E}' . + '\\x{108E0}-\\x{108F2}' . + '\\x{108F4}-\\x{108F5}' . + '\\x{10900}-\\x{10915}' . + '\\x{10920}-\\x{10939}' . + '\\x{10980}-\\x{109B7}' . + '\\x{109BE}-\\x{109BF}' . + '\\x{10A00}' . + '\\x{10A10}-\\x{10A13}' . + '\\x{10A15}-\\x{10A17}' . + '\\x{10A19}-\\x{10A35}' . + '\\x{10A60}-\\x{10A7C}' . + '\\x{10A80}-\\x{10A9C}' . + '\\x{10AC0}-\\x{10AC7}' . + '\\x{10AC9}-\\x{10AE4}' . + '\\x{10B00}-\\x{10B35}' . + '\\x{10B40}-\\x{10B55}' . + '\\x{10B60}-\\x{10B72}' . + '\\x{10B80}-\\x{10B91}' . + '\\x{10C00}-\\x{10C48}' . + '\\x{10C80}-\\x{10CB2}' . + '\\x{10CC0}-\\x{10CF2}' . + '\\x{10D00}-\\x{10D23}' . + '\\x{10E80}-\\x{10EA9}' . + '\\x{10EB0}-\\x{10EB1}' . + '\\x{10F00}-\\x{10F1C}' . + '\\x{10F27}' . + '\\x{10F30}-\\x{10F45}' . + '\\x{10F70}-\\x{10F81}' . + '\\x{10FB0}-\\x{10FC4}' . + '\\x{10FE0}-\\x{10FF6}' . + '\\x{11003}-\\x{11037}' . + '\\x{11071}-\\x{11072}' . + '\\x{11075}' . + '\\x{11083}-\\x{110AF}' . + '\\x{110D0}-\\x{110E8}' . + '\\x{11103}-\\x{11126}' . + '\\x{11144}' . + '\\x{11147}' . + '\\x{11150}-\\x{11172}' . + '\\x{11176}' . + '\\x{11183}-\\x{111B2}' . + '\\x{111C1}-\\x{111C4}' . + '\\x{111DA}' . + '\\x{111DC}' . + '\\x{11200}-\\x{11211}' . + '\\x{11213}-\\x{1122B}' . + '\\x{1123F}-\\x{11240}' . + '\\x{11280}-\\x{11286}' . + '\\x{11288}' . + '\\x{1128A}-\\x{1128D}' . + '\\x{1128F}-\\x{1129D}' . + '\\x{1129F}-\\x{112A8}' . + '\\x{112B0}-\\x{112DE}' . + '\\x{11305}-\\x{1130C}' . + '\\x{1130F}-\\x{11310}' . + '\\x{11313}-\\x{11328}' . + '\\x{1132A}-\\x{11330}' . + '\\x{11332}-\\x{11333}' . + '\\x{11335}-\\x{11339}' . + '\\x{1133D}' . + '\\x{11350}' . + '\\x{1135D}-\\x{11361}' . + '\\x{11400}-\\x{11434}' . + '\\x{11447}-\\x{1144A}' . + '\\x{1145F}-\\x{11461}' . + '\\x{11480}-\\x{114AF}' . + '\\x{114C4}-\\x{114C5}' . + '\\x{114C7}' . + '\\x{11580}-\\x{115AE}' . + '\\x{115D8}-\\x{115DB}' . + '\\x{11600}-\\x{1162F}' . + '\\x{11644}' . + '\\x{11680}-\\x{116AA}' . + '\\x{116B8}' . + '\\x{11800}-\\x{1182B}' . + '\\x{118A0}-\\x{118DF}' . + '\\x{118FF}-\\x{11906}' . + '\\x{11909}' . + '\\x{1190C}-\\x{11913}' . + '\\x{11915}-\\x{11916}' . + '\\x{11918}-\\x{1192F}' . + '\\x{1193F}' . + '\\x{11941}' . + '\\x{119A0}-\\x{119A7}' . + '\\x{119AA}-\\x{119D0}' . + '\\x{119E1}' . + '\\x{119E3}' . + '\\x{11A00}' . + '\\x{11A0B}-\\x{11A32}' . + '\\x{11A3A}' . + '\\x{11A50}' . + '\\x{11A5C}-\\x{11A89}' . + '\\x{11A9D}' . + '\\x{11AB0}-\\x{11AF8}' . + '\\x{11C00}-\\x{11C08}' . + '\\x{11C0A}-\\x{11C2E}' . + '\\x{11C40}' . + '\\x{11C72}-\\x{11C8F}' . + '\\x{11D00}-\\x{11D06}' . + '\\x{11D08}-\\x{11D09}' . + '\\x{11D0B}-\\x{11D30}' . + '\\x{11D46}' . + '\\x{11D60}-\\x{11D65}' . + '\\x{11D67}-\\x{11D68}' . + '\\x{11D6A}-\\x{11D89}' . + '\\x{11D98}' . + '\\x{11EE0}-\\x{11EF2}' . + '\\x{11F02}' . + '\\x{11F04}-\\x{11F10}' . + '\\x{11F12}-\\x{11F33}' . + '\\x{11FB0}' . + '\\x{12000}-\\x{12399}' . + '\\x{12400}-\\x{1246E}' . + '\\x{12480}-\\x{12543}' . + '\\x{12F90}-\\x{12FF0}' . + '\\x{13000}-\\x{1342F}' . + '\\x{13441}-\\x{13446}' . + '\\x{14400}-\\x{14646}' . + '\\x{16800}-\\x{16A38}' . + '\\x{16A40}-\\x{16A5E}' . + '\\x{16A70}-\\x{16ABE}' . + '\\x{16AD0}-\\x{16AED}' . + '\\x{16B00}-\\x{16B2F}' . + '\\x{16B40}-\\x{16B43}' . + '\\x{16B63}-\\x{16B77}' . + '\\x{16B7D}-\\x{16B8F}' . + '\\x{16E40}-\\x{16E7F}' . + '\\x{16F00}-\\x{16F4A}' . + '\\x{16F50}' . + '\\x{16F93}-\\x{16F9F}' . + '\\x{16FE0}-\\x{16FE1}' . + '\\x{16FE3}' . + '\\x{1BC00}-\\x{1BC6A}' . + '\\x{1BC70}-\\x{1BC7C}' . + '\\x{1BC80}-\\x{1BC88}' . + '\\x{1BC90}-\\x{1BC99}' . + '\\x{1D400}-\\x{1D454}' . + '\\x{1D456}-\\x{1D49C}' . + '\\x{1D49E}-\\x{1D49F}' . + '\\x{1D4A2}' . + '\\x{1D4A5}-\\x{1D4A6}' . + '\\x{1D4A9}-\\x{1D4AC}' . + '\\x{1D4AE}-\\x{1D4B9}' . + '\\x{1D4BB}' . + '\\x{1D4BD}-\\x{1D4C3}' . + '\\x{1D4C5}-\\x{1D505}' . + '\\x{1D507}-\\x{1D50A}' . + '\\x{1D50D}-\\x{1D514}' . + '\\x{1D516}-\\x{1D51C}' . + '\\x{1D51E}-\\x{1D539}' . + '\\x{1D53B}-\\x{1D53E}' . + '\\x{1D540}-\\x{1D544}' . + '\\x{1D546}' . + '\\x{1D54A}-\\x{1D550}' . + '\\x{1D552}-\\x{1D6A5}' . + '\\x{1D6A8}-\\x{1D6C0}' . + '\\x{1D6C2}-\\x{1D6DA}' . + '\\x{1D6DC}-\\x{1D6FA}' . + '\\x{1D6FC}-\\x{1D714}' . + '\\x{1D716}-\\x{1D734}' . + '\\x{1D736}-\\x{1D74E}' . + '\\x{1D750}-\\x{1D76E}' . + '\\x{1D770}-\\x{1D788}' . + '\\x{1D78A}-\\x{1D7A8}' . + '\\x{1D7AA}-\\x{1D7C2}' . + '\\x{1D7C4}-\\x{1D7CB}' . + '\\x{1DF00}-\\x{1DF09}' . + '\\x{1DF0A}' . + '\\x{1DF0B}-\\x{1DF1E}' . + '\\x{1DF25}-\\x{1DF2A}' . + '\\x{1E030}-\\x{1E06D}' . + '\\x{1E100}-\\x{1E12C}' . + '\\x{1E137}-\\x{1E13D}' . + '\\x{1E14E}' . + '\\x{1E290}-\\x{1E2AD}' . + '\\x{1E2C0}-\\x{1E2EB}' . + '\\x{1E4D0}-\\x{1E4EA}' . + '\\x{1E4EB}' . + '\\x{1E7E0}-\\x{1E7E6}' . + '\\x{1E7E8}-\\x{1E7EB}' . + '\\x{1E7ED}-\\x{1E7EE}' . + '\\x{1E7F0}-\\x{1E7FE}' . + '\\x{1E800}-\\x{1E8C4}' . + '\\x{1E900}-\\x{1E943}' . + '\\x{1E94B}' . + '\\x{1EE00}-\\x{1EE03}' . + '\\x{1EE05}-\\x{1EE1F}' . + '\\x{1EE21}-\\x{1EE22}' . + '\\x{1EE24}' . + '\\x{1EE27}' . + '\\x{1EE29}-\\x{1EE32}' . + '\\x{1EE34}-\\x{1EE37}' . + '\\x{1EE39}' . + '\\x{1EE3B}' . + '\\x{1EE42}' . + '\\x{1EE47}' . + '\\x{1EE49}' . + '\\x{1EE4B}' . + '\\x{1EE4D}-\\x{1EE4F}' . + '\\x{1EE51}-\\x{1EE52}' . + '\\x{1EE54}' . + '\\x{1EE57}' . + '\\x{1EE59}' . + '\\x{1EE5B}' . + '\\x{1EE5D}' . + '\\x{1EE5F}' . + '\\x{1EE61}-\\x{1EE62}' . + '\\x{1EE64}' . + '\\x{1EE67}-\\x{1EE6A}' . + '\\x{1EE6C}-\\x{1EE72}' . + '\\x{1EE74}-\\x{1EE77}' . + '\\x{1EE79}-\\x{1EE7C}' . + '\\x{1EE7E}' . + '\\x{1EE80}-\\x{1EE89}' . + '\\x{1EE8B}-\\x{1EE9B}' . + '\\x{1EEA1}-\\x{1EEA3}' . + '\\x{1EEA5}-\\x{1EEA9}' . + '\\x{1EEAB}-\\x{1EEBB}' . + '\\x{1F130}-\\x{1F149}' . + '\\x{1F150}-\\x{1F169}' . + '\\x{1F170}-\\x{1F189}', 'Bidi_Control' => '\\x{061C}' . '\\x{200E}-\\x{200F}' . @@ -1676,6 +2334,577 @@ function utf8_regex_properties(): array '\\x{1FAF7}-\\x{1FAF8}', 'Emoji_Modifier' => '\\x{1F3FB}-\\x{1F3FF}', + 'Extend' => + '\\x{0300}-\\x{036F}' . + '\\x{0483}-\\x{0487}' . + '\\x{0488}-\\x{0489}' . + '\\x{0591}-\\x{05BD}' . + '\\x{05BF}' . + '\\x{05C1}-\\x{05C2}' . + '\\x{05C4}-\\x{05C5}' . + '\\x{05C7}' . + '\\x{0610}-\\x{061A}' . + '\\x{064B}-\\x{065F}' . + '\\x{0670}' . + '\\x{06D6}-\\x{06DC}' . + '\\x{06DF}-\\x{06E4}' . + '\\x{06E7}-\\x{06E8}' . + '\\x{06EA}-\\x{06ED}' . + '\\x{0711}' . + '\\x{0730}-\\x{074A}' . + '\\x{07A6}-\\x{07B0}' . + '\\x{07EB}-\\x{07F3}' . + '\\x{07FD}' . + '\\x{0816}-\\x{0819}' . + '\\x{081B}-\\x{0823}' . + '\\x{0825}-\\x{0827}' . + '\\x{0829}-\\x{082D}' . + '\\x{0859}-\\x{085B}' . + '\\x{0898}-\\x{089F}' . + '\\x{08CA}-\\x{08E1}' . + '\\x{08E3}-\\x{0902}' . + '\\x{0903}' . + '\\x{093A}' . + '\\x{093B}' . + '\\x{093C}' . + '\\x{093E}-\\x{0940}' . + '\\x{0941}-\\x{0948}' . + '\\x{0949}-\\x{094C}' . + '\\x{094D}' . + '\\x{094E}-\\x{094F}' . + '\\x{0951}-\\x{0957}' . + '\\x{0962}-\\x{0963}' . + '\\x{0981}' . + '\\x{0982}-\\x{0983}' . + '\\x{09BC}' . + '\\x{09BE}-\\x{09C0}' . + '\\x{09C1}-\\x{09C4}' . + '\\x{09C7}-\\x{09C8}' . + '\\x{09CB}-\\x{09CC}' . + '\\x{09CD}' . + '\\x{09D7}' . + '\\x{09E2}-\\x{09E3}' . + '\\x{09FE}' . + '\\x{0A01}-\\x{0A02}' . + '\\x{0A03}' . + '\\x{0A3C}' . + '\\x{0A3E}-\\x{0A40}' . + '\\x{0A41}-\\x{0A42}' . + '\\x{0A47}-\\x{0A48}' . + '\\x{0A4B}-\\x{0A4D}' . + '\\x{0A51}' . + '\\x{0A70}-\\x{0A71}' . + '\\x{0A75}' . + '\\x{0A81}-\\x{0A82}' . + '\\x{0A83}' . + '\\x{0ABC}' . + '\\x{0ABE}-\\x{0AC0}' . + '\\x{0AC1}-\\x{0AC5}' . + '\\x{0AC7}-\\x{0AC8}' . + '\\x{0AC9}' . + '\\x{0ACB}-\\x{0ACC}' . + '\\x{0ACD}' . + '\\x{0AE2}-\\x{0AE3}' . + '\\x{0AFA}-\\x{0AFF}' . + '\\x{0B01}' . + '\\x{0B02}-\\x{0B03}' . + '\\x{0B3C}' . + '\\x{0B3E}' . + '\\x{0B3F}' . + '\\x{0B40}' . + '\\x{0B41}-\\x{0B44}' . + '\\x{0B47}-\\x{0B48}' . + '\\x{0B4B}-\\x{0B4C}' . + '\\x{0B4D}' . + '\\x{0B55}-\\x{0B56}' . + '\\x{0B57}' . + '\\x{0B62}-\\x{0B63}' . + '\\x{0B82}' . + '\\x{0BBE}-\\x{0BBF}' . + '\\x{0BC0}' . + '\\x{0BC1}-\\x{0BC2}' . + '\\x{0BC6}-\\x{0BC8}' . + '\\x{0BCA}-\\x{0BCC}' . + '\\x{0BCD}' . + '\\x{0BD7}' . + '\\x{0C00}' . + '\\x{0C01}-\\x{0C03}' . + '\\x{0C04}' . + '\\x{0C3C}' . + '\\x{0C3E}-\\x{0C40}' . + '\\x{0C41}-\\x{0C44}' . + '\\x{0C46}-\\x{0C48}' . + '\\x{0C4A}-\\x{0C4D}' . + '\\x{0C55}-\\x{0C56}' . + '\\x{0C62}-\\x{0C63}' . + '\\x{0C81}' . + '\\x{0C82}-\\x{0C83}' . + '\\x{0CBC}' . + '\\x{0CBE}' . + '\\x{0CBF}' . + '\\x{0CC0}-\\x{0CC4}' . + '\\x{0CC6}' . + '\\x{0CC7}-\\x{0CC8}' . + '\\x{0CCA}-\\x{0CCB}' . + '\\x{0CCC}-\\x{0CCD}' . + '\\x{0CD5}-\\x{0CD6}' . + '\\x{0CE2}-\\x{0CE3}' . + '\\x{0CF3}' . + '\\x{0D00}-\\x{0D01}' . + '\\x{0D02}-\\x{0D03}' . + '\\x{0D3B}-\\x{0D3C}' . + '\\x{0D3E}-\\x{0D40}' . + '\\x{0D41}-\\x{0D44}' . + '\\x{0D46}-\\x{0D48}' . + '\\x{0D4A}-\\x{0D4C}' . + '\\x{0D4D}' . + '\\x{0D57}' . + '\\x{0D62}-\\x{0D63}' . + '\\x{0D81}' . + '\\x{0D82}-\\x{0D83}' . + '\\x{0DCA}' . + '\\x{0DCF}-\\x{0DD1}' . + '\\x{0DD2}-\\x{0DD4}' . + '\\x{0DD6}' . + '\\x{0DD8}-\\x{0DDF}' . + '\\x{0DF2}-\\x{0DF3}' . + '\\x{0E31}' . + '\\x{0E34}-\\x{0E3A}' . + '\\x{0E47}-\\x{0E4E}' . + '\\x{0EB1}' . + '\\x{0EB4}-\\x{0EBC}' . + '\\x{0EC8}-\\x{0ECE}' . + '\\x{0F18}-\\x{0F19}' . + '\\x{0F35}' . + '\\x{0F37}' . + '\\x{0F39}' . + '\\x{0F3E}-\\x{0F3F}' . + '\\x{0F71}-\\x{0F7E}' . + '\\x{0F7F}' . + '\\x{0F80}-\\x{0F84}' . + '\\x{0F86}-\\x{0F87}' . + '\\x{0F8D}-\\x{0F97}' . + '\\x{0F99}-\\x{0FBC}' . + '\\x{0FC6}' . + '\\x{102B}-\\x{102C}' . + '\\x{102D}-\\x{1030}' . + '\\x{1031}' . + '\\x{1032}-\\x{1037}' . + '\\x{1038}' . + '\\x{1039}-\\x{103A}' . + '\\x{103B}-\\x{103C}' . + '\\x{103D}-\\x{103E}' . + '\\x{1056}-\\x{1057}' . + '\\x{1058}-\\x{1059}' . + '\\x{105E}-\\x{1060}' . + '\\x{1062}-\\x{1064}' . + '\\x{1067}-\\x{106D}' . + '\\x{1071}-\\x{1074}' . + '\\x{1082}' . + '\\x{1083}-\\x{1084}' . + '\\x{1085}-\\x{1086}' . + '\\x{1087}-\\x{108C}' . + '\\x{108D}' . + '\\x{108F}' . + '\\x{109A}-\\x{109C}' . + '\\x{109D}' . + '\\x{135D}-\\x{135F}' . + '\\x{1712}-\\x{1714}' . + '\\x{1715}' . + '\\x{1732}-\\x{1733}' . + '\\x{1734}' . + '\\x{1752}-\\x{1753}' . + '\\x{1772}-\\x{1773}' . + '\\x{17B4}-\\x{17B5}' . + '\\x{17B6}' . + '\\x{17B7}-\\x{17BD}' . + '\\x{17BE}-\\x{17C5}' . + '\\x{17C6}' . + '\\x{17C7}-\\x{17C8}' . + '\\x{17C9}-\\x{17D3}' . + '\\x{17DD}' . + '\\x{180B}-\\x{180D}' . + '\\x{180F}' . + '\\x{1885}-\\x{1886}' . + '\\x{18A9}' . + '\\x{1920}-\\x{1922}' . + '\\x{1923}-\\x{1926}' . + '\\x{1927}-\\x{1928}' . + '\\x{1929}-\\x{192B}' . + '\\x{1930}-\\x{1931}' . + '\\x{1932}' . + '\\x{1933}-\\x{1938}' . + '\\x{1939}-\\x{193B}' . + '\\x{1A17}-\\x{1A18}' . + '\\x{1A19}-\\x{1A1A}' . + '\\x{1A1B}' . + '\\x{1A55}' . + '\\x{1A56}' . + '\\x{1A57}' . + '\\x{1A58}-\\x{1A5E}' . + '\\x{1A60}' . + '\\x{1A61}' . + '\\x{1A62}' . + '\\x{1A63}-\\x{1A64}' . + '\\x{1A65}-\\x{1A6C}' . + '\\x{1A6D}-\\x{1A72}' . + '\\x{1A73}-\\x{1A7C}' . + '\\x{1A7F}' . + '\\x{1AB0}-\\x{1ABD}' . + '\\x{1ABE}' . + '\\x{1ABF}-\\x{1ACE}' . + '\\x{1B00}-\\x{1B03}' . + '\\x{1B04}' . + '\\x{1B34}' . + '\\x{1B35}' . + '\\x{1B36}-\\x{1B3A}' . + '\\x{1B3B}' . + '\\x{1B3C}' . + '\\x{1B3D}-\\x{1B41}' . + '\\x{1B42}' . + '\\x{1B43}-\\x{1B44}' . + '\\x{1B6B}-\\x{1B73}' . + '\\x{1B80}-\\x{1B81}' . + '\\x{1B82}' . + '\\x{1BA1}' . + '\\x{1BA2}-\\x{1BA5}' . + '\\x{1BA6}-\\x{1BA7}' . + '\\x{1BA8}-\\x{1BA9}' . + '\\x{1BAA}' . + '\\x{1BAB}-\\x{1BAD}' . + '\\x{1BE6}' . + '\\x{1BE7}' . + '\\x{1BE8}-\\x{1BE9}' . + '\\x{1BEA}-\\x{1BEC}' . + '\\x{1BED}' . + '\\x{1BEE}' . + '\\x{1BEF}-\\x{1BF1}' . + '\\x{1BF2}-\\x{1BF3}' . + '\\x{1C24}-\\x{1C2B}' . + '\\x{1C2C}-\\x{1C33}' . + '\\x{1C34}-\\x{1C35}' . + '\\x{1C36}-\\x{1C37}' . + '\\x{1CD0}-\\x{1CD2}' . + '\\x{1CD4}-\\x{1CE0}' . + '\\x{1CE1}' . + '\\x{1CE2}-\\x{1CE8}' . + '\\x{1CED}' . + '\\x{1CF4}' . + '\\x{1CF7}' . + '\\x{1CF8}-\\x{1CF9}' . + '\\x{1DC0}-\\x{1DFF}' . + '\\x{200C}' . + '\\x{20D0}-\\x{20DC}' . + '\\x{20DD}-\\x{20E0}' . + '\\x{20E1}' . + '\\x{20E2}-\\x{20E4}' . + '\\x{20E5}-\\x{20F0}' . + '\\x{2CEF}-\\x{2CF1}' . + '\\x{2D7F}' . + '\\x{2DE0}-\\x{2DFF}' . + '\\x{302A}-\\x{302D}' . + '\\x{302E}-\\x{302F}' . + '\\x{3099}-\\x{309A}' . + '\\x{A66F}' . + '\\x{A670}-\\x{A672}' . + '\\x{A674}-\\x{A67D}' . + '\\x{A69E}-\\x{A69F}' . + '\\x{A6F0}-\\x{A6F1}' . + '\\x{A802}' . + '\\x{A806}' . + '\\x{A80B}' . + '\\x{A823}-\\x{A824}' . + '\\x{A825}-\\x{A826}' . + '\\x{A827}' . + '\\x{A82C}' . + '\\x{A880}-\\x{A881}' . + '\\x{A8B4}-\\x{A8C3}' . + '\\x{A8C4}-\\x{A8C5}' . + '\\x{A8E0}-\\x{A8F1}' . + '\\x{A8FF}' . + '\\x{A926}-\\x{A92D}' . + '\\x{A947}-\\x{A951}' . + '\\x{A952}-\\x{A953}' . + '\\x{A980}-\\x{A982}' . + '\\x{A983}' . + '\\x{A9B3}' . + '\\x{A9B4}-\\x{A9B5}' . + '\\x{A9B6}-\\x{A9B9}' . + '\\x{A9BA}-\\x{A9BB}' . + '\\x{A9BC}-\\x{A9BD}' . + '\\x{A9BE}-\\x{A9C0}' . + '\\x{A9E5}' . + '\\x{AA29}-\\x{AA2E}' . + '\\x{AA2F}-\\x{AA30}' . + '\\x{AA31}-\\x{AA32}' . + '\\x{AA33}-\\x{AA34}' . + '\\x{AA35}-\\x{AA36}' . + '\\x{AA43}' . + '\\x{AA4C}' . + '\\x{AA4D}' . + '\\x{AA7B}' . + '\\x{AA7C}' . + '\\x{AA7D}' . + '\\x{AAB0}' . + '\\x{AAB2}-\\x{AAB4}' . + '\\x{AAB7}-\\x{AAB8}' . + '\\x{AABE}-\\x{AABF}' . + '\\x{AAC1}' . + '\\x{AAEB}' . + '\\x{AAEC}-\\x{AAED}' . + '\\x{AAEE}-\\x{AAEF}' . + '\\x{AAF5}' . + '\\x{AAF6}' . + '\\x{ABE3}-\\x{ABE4}' . + '\\x{ABE5}' . + '\\x{ABE6}-\\x{ABE7}' . + '\\x{ABE8}' . + '\\x{ABE9}-\\x{ABEA}' . + '\\x{ABEC}' . + '\\x{ABED}' . + '\\x{FB1E}' . + '\\x{FE00}-\\x{FE0F}' . + '\\x{FE20}-\\x{FE2F}' . + '\\x{FF9E}-\\x{FF9F}' . + '\\x{101FD}' . + '\\x{102E0}' . + '\\x{10376}-\\x{1037A}' . + '\\x{10A01}-\\x{10A03}' . + '\\x{10A05}-\\x{10A06}' . + '\\x{10A0C}-\\x{10A0F}' . + '\\x{10A38}-\\x{10A3A}' . + '\\x{10A3F}' . + '\\x{10AE5}-\\x{10AE6}' . + '\\x{10D24}-\\x{10D27}' . + '\\x{10EAB}-\\x{10EAC}' . + '\\x{10EFD}-\\x{10EFF}' . + '\\x{10F46}-\\x{10F50}' . + '\\x{10F82}-\\x{10F85}' . + '\\x{11000}' . + '\\x{11001}' . + '\\x{11002}' . + '\\x{11038}-\\x{11046}' . + '\\x{11070}' . + '\\x{11073}-\\x{11074}' . + '\\x{1107F}-\\x{11081}' . + '\\x{11082}' . + '\\x{110B0}-\\x{110B2}' . + '\\x{110B3}-\\x{110B6}' . + '\\x{110B7}-\\x{110B8}' . + '\\x{110B9}-\\x{110BA}' . + '\\x{110C2}' . + '\\x{11100}-\\x{11102}' . + '\\x{11127}-\\x{1112B}' . + '\\x{1112C}' . + '\\x{1112D}-\\x{11134}' . + '\\x{11145}-\\x{11146}' . + '\\x{11173}' . + '\\x{11180}-\\x{11181}' . + '\\x{11182}' . + '\\x{111B3}-\\x{111B5}' . + '\\x{111B6}-\\x{111BE}' . + '\\x{111BF}-\\x{111C0}' . + '\\x{111C9}-\\x{111CC}' . + '\\x{111CE}' . + '\\x{111CF}' . + '\\x{1122C}-\\x{1122E}' . + '\\x{1122F}-\\x{11231}' . + '\\x{11232}-\\x{11233}' . + '\\x{11234}' . + '\\x{11235}' . + '\\x{11236}-\\x{11237}' . + '\\x{1123E}' . + '\\x{11241}' . + '\\x{112DF}' . + '\\x{112E0}-\\x{112E2}' . + '\\x{112E3}-\\x{112EA}' . + '\\x{11300}-\\x{11301}' . + '\\x{11302}-\\x{11303}' . + '\\x{1133B}-\\x{1133C}' . + '\\x{1133E}-\\x{1133F}' . + '\\x{11340}' . + '\\x{11341}-\\x{11344}' . + '\\x{11347}-\\x{11348}' . + '\\x{1134B}-\\x{1134D}' . + '\\x{11357}' . + '\\x{11362}-\\x{11363}' . + '\\x{11366}-\\x{1136C}' . + '\\x{11370}-\\x{11374}' . + '\\x{11435}-\\x{11437}' . + '\\x{11438}-\\x{1143F}' . + '\\x{11440}-\\x{11441}' . + '\\x{11442}-\\x{11444}' . + '\\x{11445}' . + '\\x{11446}' . + '\\x{1145E}' . + '\\x{114B0}-\\x{114B2}' . + '\\x{114B3}-\\x{114B8}' . + '\\x{114B9}' . + '\\x{114BA}' . + '\\x{114BB}-\\x{114BE}' . + '\\x{114BF}-\\x{114C0}' . + '\\x{114C1}' . + '\\x{114C2}-\\x{114C3}' . + '\\x{115AF}-\\x{115B1}' . + '\\x{115B2}-\\x{115B5}' . + '\\x{115B8}-\\x{115BB}' . + '\\x{115BC}-\\x{115BD}' . + '\\x{115BE}' . + '\\x{115BF}-\\x{115C0}' . + '\\x{115DC}-\\x{115DD}' . + '\\x{11630}-\\x{11632}' . + '\\x{11633}-\\x{1163A}' . + '\\x{1163B}-\\x{1163C}' . + '\\x{1163D}' . + '\\x{1163E}' . + '\\x{1163F}-\\x{11640}' . + '\\x{116AB}' . + '\\x{116AC}' . + '\\x{116AD}' . + '\\x{116AE}-\\x{116AF}' . + '\\x{116B0}-\\x{116B5}' . + '\\x{116B6}' . + '\\x{116B7}' . + '\\x{1171D}-\\x{1171F}' . + '\\x{11720}-\\x{11721}' . + '\\x{11722}-\\x{11725}' . + '\\x{11726}' . + '\\x{11727}-\\x{1172B}' . + '\\x{1182C}-\\x{1182E}' . + '\\x{1182F}-\\x{11837}' . + '\\x{11838}' . + '\\x{11839}-\\x{1183A}' . + '\\x{11930}-\\x{11935}' . + '\\x{11937}-\\x{11938}' . + '\\x{1193B}-\\x{1193C}' . + '\\x{1193D}' . + '\\x{1193E}' . + '\\x{11940}' . + '\\x{11942}' . + '\\x{11943}' . + '\\x{119D1}-\\x{119D3}' . + '\\x{119D4}-\\x{119D7}' . + '\\x{119DA}-\\x{119DB}' . + '\\x{119DC}-\\x{119DF}' . + '\\x{119E0}' . + '\\x{119E4}' . + '\\x{11A01}-\\x{11A0A}' . + '\\x{11A33}-\\x{11A38}' . + '\\x{11A39}' . + '\\x{11A3B}-\\x{11A3E}' . + '\\x{11A47}' . + '\\x{11A51}-\\x{11A56}' . + '\\x{11A57}-\\x{11A58}' . + '\\x{11A59}-\\x{11A5B}' . + '\\x{11A8A}-\\x{11A96}' . + '\\x{11A97}' . + '\\x{11A98}-\\x{11A99}' . + '\\x{11C2F}' . + '\\x{11C30}-\\x{11C36}' . + '\\x{11C38}-\\x{11C3D}' . + '\\x{11C3E}' . + '\\x{11C3F}' . + '\\x{11C92}-\\x{11CA7}' . + '\\x{11CA9}' . + '\\x{11CAA}-\\x{11CB0}' . + '\\x{11CB1}' . + '\\x{11CB2}-\\x{11CB3}' . + '\\x{11CB4}' . + '\\x{11CB5}-\\x{11CB6}' . + '\\x{11D31}-\\x{11D36}' . + '\\x{11D3A}' . + '\\x{11D3C}-\\x{11D3D}' . + '\\x{11D3F}-\\x{11D45}' . + '\\x{11D47}' . + '\\x{11D8A}-\\x{11D8E}' . + '\\x{11D90}-\\x{11D91}' . + '\\x{11D93}-\\x{11D94}' . + '\\x{11D95}' . + '\\x{11D96}' . + '\\x{11D97}' . + '\\x{11EF3}-\\x{11EF4}' . + '\\x{11EF5}-\\x{11EF6}' . + '\\x{11F00}-\\x{11F01}' . + '\\x{11F03}' . + '\\x{11F34}-\\x{11F35}' . + '\\x{11F36}-\\x{11F3A}' . + '\\x{11F3E}-\\x{11F3F}' . + '\\x{11F40}' . + '\\x{11F41}' . + '\\x{11F42}' . + '\\x{13440}' . + '\\x{13447}-\\x{13455}' . + '\\x{16AF0}-\\x{16AF4}' . + '\\x{16B30}-\\x{16B36}' . + '\\x{16F4F}' . + '\\x{16F51}-\\x{16F87}' . + '\\x{16F8F}-\\x{16F92}' . + '\\x{16FE4}' . + '\\x{16FF0}-\\x{16FF1}' . + '\\x{1BC9D}-\\x{1BC9E}' . + '\\x{1CF00}-\\x{1CF2D}' . + '\\x{1CF30}-\\x{1CF46}' . + '\\x{1D165}-\\x{1D166}' . + '\\x{1D167}-\\x{1D169}' . + '\\x{1D16D}-\\x{1D172}' . + '\\x{1D17B}-\\x{1D182}' . + '\\x{1D185}-\\x{1D18B}' . + '\\x{1D1AA}-\\x{1D1AD}' . + '\\x{1D242}-\\x{1D244}' . + '\\x{1DA00}-\\x{1DA36}' . + '\\x{1DA3B}-\\x{1DA6C}' . + '\\x{1DA75}' . + '\\x{1DA84}' . + '\\x{1DA9B}-\\x{1DA9F}' . + '\\x{1DAA1}-\\x{1DAAF}' . + '\\x{1E000}-\\x{1E006}' . + '\\x{1E008}-\\x{1E018}' . + '\\x{1E01B}-\\x{1E021}' . + '\\x{1E023}-\\x{1E024}' . + '\\x{1E026}-\\x{1E02A}' . + '\\x{1E08F}' . + '\\x{1E130}-\\x{1E136}' . + '\\x{1E2AE}' . + '\\x{1E2EC}-\\x{1E2EF}' . + '\\x{1E4EC}-\\x{1E4EF}' . + '\\x{1E8D0}-\\x{1E8D6}' . + '\\x{1E944}-\\x{1E94A}' . + '\\x{1F3FB}-\\x{1F3FF}' . + '\\x{E0020}-\\x{E007F}' . + '\\x{E0100}-\\x{E01EF}', + 'ExtendNumLet' => + '\\x{005F}' . + '\\x{202F}' . + '\\x{203F}-\\x{2040}' . + '\\x{2054}' . + '\\x{FE33}-\\x{FE34}' . + '\\x{FE4D}-\\x{FE4F}' . + '\\x{FF3F}', + 'Format' => + '\\x{00AD}' . + '\\x{061C}' . + '\\x{180E}' . + '\\x{200E}-\\x{200F}' . + '\\x{202A}-\\x{202E}' . + '\\x{2060}-\\x{2064}' . + '\\x{2066}-\\x{206F}' . + '\\x{FEFF}' . + '\\x{FFF9}-\\x{FFFB}' . + '\\x{13430}-\\x{1343F}' . + '\\x{1BCA0}-\\x{1BCA3}' . + '\\x{1D173}-\\x{1D17A}' . + '\\x{E0001}', + 'Hebrew_Letter' => + '\\x{05D0}-\\x{05EA}' . + '\\x{05EF}-\\x{05F2}' . + '\\x{FB1D}' . + '\\x{FB1F}-\\x{FB28}' . + '\\x{FB2A}-\\x{FB36}' . + '\\x{FB38}-\\x{FB3C}' . + '\\x{FB3E}' . + '\\x{FB40}-\\x{FB41}' . + '\\x{FB43}-\\x{FB44}' . + '\\x{FB46}-\\x{FB4F}', 'Ideographic' => '\\x{3006}' . '\\x{3007}' . @@ -1701,6 +2930,131 @@ function utf8_regex_properties(): array '\\x{31350}-\\x{323AF}', 'Join_Control' => '\\x{200C}-\\x{200D}', + 'Katakana' => + '\\x{3031}-\\x{3035}' . + '\\x{309B}-\\x{309C}' . + '\\x{30A0}' . + '\\x{30A1}-\\x{30FA}' . + '\\x{30FC}-\\x{30FE}' . + '\\x{30FF}' . + '\\x{31F0}-\\x{31FF}' . + '\\x{32D0}-\\x{32FE}' . + '\\x{3300}-\\x{3357}' . + '\\x{FF66}-\\x{FF6F}' . + '\\x{FF70}' . + '\\x{FF71}-\\x{FF9D}' . + '\\x{1AFF0}-\\x{1AFF3}' . + '\\x{1AFF5}-\\x{1AFFB}' . + '\\x{1AFFD}-\\x{1AFFE}' . + '\\x{1B000}' . + '\\x{1B120}-\\x{1B122}' . + '\\x{1B155}' . + '\\x{1B164}-\\x{1B167}', + 'MidLetter' => + '\\x{003A}' . + '\\x{00B7}' . + '\\x{0387}' . + '\\x{055F}' . + '\\x{05F4}' . + '\\x{2027}' . + '\\x{FE13}' . + '\\x{FE55}' . + '\\x{FF1A}', + 'MidNum' => + '\\x{002C}' . + '\\x{003B}' . + '\\x{037E}' . + '\\x{0589}' . + '\\x{060C}-\\x{060D}' . + '\\x{066C}' . + '\\x{07F8}' . + '\\x{2044}' . + '\\x{FE10}' . + '\\x{FE14}' . + '\\x{FE50}' . + '\\x{FE54}' . + '\\x{FF0C}' . + '\\x{FF1B}', + 'MidNumLet' => + '\\x{002E}' . + '\\x{2018}' . + '\\x{2019}' . + '\\x{2024}' . + '\\x{FE52}' . + '\\x{FF07}' . + '\\x{FF0E}', + 'Numeric' => + '\\x{0030}-\\x{0039}' . + '\\x{0600}-\\x{0605}' . + '\\x{0660}-\\x{0669}' . + '\\x{066B}' . + '\\x{06DD}' . + '\\x{06F0}-\\x{06F9}' . + '\\x{07C0}-\\x{07C9}' . + '\\x{0890}-\\x{0891}' . + '\\x{08E2}' . + '\\x{0966}-\\x{096F}' . + '\\x{09E6}-\\x{09EF}' . + '\\x{0A66}-\\x{0A6F}' . + '\\x{0AE6}-\\x{0AEF}' . + '\\x{0B66}-\\x{0B6F}' . + '\\x{0BE6}-\\x{0BEF}' . + '\\x{0C66}-\\x{0C6F}' . + '\\x{0CE6}-\\x{0CEF}' . + '\\x{0D66}-\\x{0D6F}' . + '\\x{0DE6}-\\x{0DEF}' . + '\\x{0E50}-\\x{0E59}' . + '\\x{0ED0}-\\x{0ED9}' . + '\\x{0F20}-\\x{0F29}' . + '\\x{1040}-\\x{1049}' . + '\\x{1090}-\\x{1099}' . + '\\x{17E0}-\\x{17E9}' . + '\\x{1810}-\\x{1819}' . + '\\x{1946}-\\x{194F}' . + '\\x{19D0}-\\x{19D9}' . + '\\x{1A80}-\\x{1A89}' . + '\\x{1A90}-\\x{1A99}' . + '\\x{1B50}-\\x{1B59}' . + '\\x{1BB0}-\\x{1BB9}' . + '\\x{1C40}-\\x{1C49}' . + '\\x{1C50}-\\x{1C59}' . + '\\x{A620}-\\x{A629}' . + '\\x{A8D0}-\\x{A8D9}' . + '\\x{A900}-\\x{A909}' . + '\\x{A9D0}-\\x{A9D9}' . + '\\x{A9F0}-\\x{A9F9}' . + '\\x{AA50}-\\x{AA59}' . + '\\x{ABF0}-\\x{ABF9}' . + '\\x{FF10}-\\x{FF19}' . + '\\x{104A0}-\\x{104A9}' . + '\\x{10D30}-\\x{10D39}' . + '\\x{11066}-\\x{1106F}' . + '\\x{110BD}' . + '\\x{110CD}' . + '\\x{110F0}-\\x{110F9}' . + '\\x{11136}-\\x{1113F}' . + '\\x{111D0}-\\x{111D9}' . + '\\x{112F0}-\\x{112F9}' . + '\\x{11450}-\\x{11459}' . + '\\x{114D0}-\\x{114D9}' . + '\\x{11650}-\\x{11659}' . + '\\x{116C0}-\\x{116C9}' . + '\\x{11730}-\\x{11739}' . + '\\x{118E0}-\\x{118E9}' . + '\\x{11950}-\\x{11959}' . + '\\x{11C50}-\\x{11C59}' . + '\\x{11D50}-\\x{11D59}' . + '\\x{11DA0}-\\x{11DA9}' . + '\\x{11F50}-\\x{11F59}' . + '\\x{16A60}-\\x{16A69}' . + '\\x{16AC0}-\\x{16AC9}' . + '\\x{16B50}-\\x{16B59}' . + '\\x{1D7CE}-\\x{1D7FF}' . + '\\x{1E140}-\\x{1E149}' . + '\\x{1E2F0}-\\x{1E2F9}' . + '\\x{1E4F0}-\\x{1E4F9}' . + '\\x{1E950}-\\x{1E959}' . + '\\x{1FBF0}-\\x{1FBF9}', 'Regional_Indicator' => '\\x{1F1E6}-\\x{1F1FF}', 'Variation_Selector' => @@ -1708,6 +3062,13 @@ function utf8_regex_properties(): array '\\x{180F}' . '\\x{FE00}-\\x{FE0F}' . '\\x{E0100}-\\x{E01EF}', + 'WSegSpace' => + '\\x{0020}' . + '\\x{1680}' . + '\\x{2000}-\\x{2006}' . + '\\x{2008}-\\x{200A}' . + '\\x{205F}' . + '\\x{3000}', ]; } @@ -1715,8 +3076,8 @@ function utf8_regex_properties(): array * Helper function for utf8_sanitize_invisibles. * * Character class lists compiled from: - * https://unicode.org/Public/UNIDATA/StandardizedVariants.txt - * https://unicode.org/Public/UNIDATA/emoji/emoji-variation-sequences.txt + * https://www.unicode.org/Public/UCD/latest/ucd/StandardizedVariants.txt + * https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-variation-sequences.txt * * Developers: Do not update the data in this function manually. Instead, * run "php -f other/update_unicode_data.php" on the command line. @@ -3074,7 +4435,7 @@ function utf8_regex_variation_selectors(): array * Helper function for utf8_sanitize_invisibles. * * Character class lists compiled from: - * https://unicode.org/Public/UNIDATA/extracted/DerivedJoiningType.txt + * https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedJoiningType.txt * * Developers: Do not update the data in this function manually. Instead, * run "php -f other/update_unicode_data.php" on the command line. @@ -3366,8 +4727,8 @@ function utf8_regex_joining_type(): array * Helper function for utf8_sanitize_invisibles. * * Character class lists compiled from: - * https://unicode.org/Public/UNIDATA/extracted/DerivedCombiningClass.txt - * https://unicode.org/Public/UNIDATA/IndicSyllabicCategory.txt + * https://www.unicode.org/Public/UCD/latest/ucd/extracted/DerivedCombiningClass.txt + * https://www.unicode.org/Public/UCD/latest/ucd/IndicSyllabicCategory.txt * * Developers: Do not update the data in this function manually. Instead, * run "php -f other/update_unicode_data.php" on the command line. diff --git a/Sources/Unicode/Utf8String.php b/Sources/Unicode/Utf8String.php index d63b1df2b9..8486a2fd30 100644 --- a/Sources/Unicode/Utf8String.php +++ b/Sources/Unicode/Utf8String.php @@ -17,6 +17,7 @@ use SMF\Config; use SMF\Lang; use SMF\User; +use SMF\Utils; /** * A class for manipulating UTF-8 strings. @@ -653,9 +654,6 @@ public function sanitizeInvisibles(int $level, string $substitute): object * * Emoji characters count as words. Punctuation and other symbols do not. * - * @todo Improve the fallback code we use when the IntlBreakIterator class - * is unavailable. - * * @param int $level See documentation for Utf8String::sanitizeInvisibles(). * @return array The words in this string. */ @@ -676,24 +674,23 @@ public function extractWords(int $level): array // Normalize the whitespace. $this->string = \SMF\Utils::normalizeSpaces($this->string, true, true, ['replace_tabs' => true, 'collapse_hspace' => true]); - // Preserve emoji characters, variation selectors, and join controls. + // Sanitize variation selectors and join controls. $placeholders = []; - $this->preserveEmoji($placeholders); $this->sanitizeVariationSelectors($placeholders, ' '); $this->sanitizeJoinControls($placeholders, $level, ' '); - // Remove the private use characters that delimit the placeholders - // so that they don't interfere with the word splitting. - foreach ($placeholders as $key => $placeholder) { - $simple_placeholder = sha1($placeholder); - $this->string = str_replace($placeholder, $simple_placeholder, $this->string); - $placeholders[$key] = $simple_placeholder; + if (!empty($placeholders)) { + $this->string = strtr($this->string, array_flip($placeholders)); } + // We'll need these one way or another. + require_once __DIR__ . '/RegularExpressions.php'; + $prop_classes = utf8_regex_properties(); + // Split into words, with Unicode awareness. // Prefer IntlBreakIterator if it is available. if (class_exists('IntlBreakIterator')) { - $break_iterator = \IntlBreakIterator::createWordInstance(); + $break_iterator = \IntlBreakIterator::createWordInstance(Lang::getLocaleFromLanguageName(Config::$language)); $break_iterator->setText($this->string); $parts_interator = $break_iterator->getPartsIterator(); @@ -704,30 +701,276 @@ public function extractWords(int $level): array } } else { /* - * This is a sad, weak substitute for the IntlBreakIterator. - * It works well enough for European languages, but it fails badly - * for many Asian languages. To improve it will require adding more - * data to our Unicode data files and then writing code to implement - * the Unicode word break algorithm. + * This implements the default Unicode word break algorithm. + * It does not adapt to different locales. * See https://www.unicode.org/reports/tr29/#Word_Boundaries */ - $words = preg_split('/(?string); - } + $chars = preg_split('/(.)/su', $string, 0, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY); - foreach ($words as $key => $word) { - $word = trim($word); + foreach ($chars as $i => $char) { + $chars[$i] = [ + 'char' => $char, + 'break_after' => false, + ]; + } - if (preg_replace('/\W/u', '', $word) === '') { - unset($words[$key]); + for ($i = 0; $i < count($chars); $i++) { + $substring_before = implode('', array_slice(array_map(fn ($char) => $char['char'], $chars), 0, $i)); + $substring_after = implode('', array_slice(array_map(fn ($char) => $char['char'], $chars), $i)); - continue; + // Do not break within CRLF. + if ($chars[$i]['char'] === "\r" && isset($chars[$i + 1]) && $chars[$i + 1]['char'] === "\n") { + $chars[$i]['break_after'] = false; + continue; + } + + // Otherwise break before and after line breaks. + if (preg_match('/\v/u', $char)) { + $chars[$i]['break_after'] = true; + continue; + } + + // Do not break within emoji zwj sequences. + if (preg_match('/^\x{200D}[' . $prop_classes['Emoji'] . ']/u', $substring_after)) { + $chars[$i]['break_after'] = false; + continue; + } + + // Keep horizontal whitespace together. + if (preg_match('/^[' . $prop_classes['WSegSpace'] . ']{2}/u', $substring_after)) { + $chars[$i]['break_after'] = false; + continue; + } + + // Ignore Format and Extend characters, except after start of text and line breaks. + if ( + preg_match( + '/^\V([' . $prop_classes['Extend'] . $prop_classes['Format'] . '\x{200D}]+)/u', + $substring_after, + $matches, + ) + ) { + // Don't break before the extending character. + $chars[$i]['break_after'] = false; + + // Don't break after the extending characters (except perhaps the last one). + for ($j = 1; $j <= mb_strlen($matches[1]); $j++) { + $chars[$i + $j]['break_after'] = false; + } + + // Test consists of the characters before and after the extending characters. + if (isset($chars[$i + $j + 1])) { + $test_string .= $chars[$i]['char'] . $chars[$i + $j + 1]['char']; + + $current_string = $this->string; + $this->string = $test_string; + + // Set the break_after of the last extender to whether there + // would be a break if the extenders were not present. + $chars[$i + $j]['break_after'] = count($this->extractWords($level)) > 1; + + $this->string = $current_string; + } + + $i += $j; + + continue; + } + + // Do not break between most letters. + if (preg_match('/^[' . $prop_classes['ALetter'] . $prop_classes['Hebrew_Letter'] . ']{2}/u', $substring_after)) { + $chars[$i]['break_after'] = false; + continue; + } + + // Do not break letters across certain punctuation, such as within "e.g." or "example.com". + if ( + preg_match( + '/^' . + '[' . $prop_classes['ALetter'] . $prop_classes['Hebrew_Letter'] . ']' . + '[' . $prop_classes['MidLetter'] . $prop_classes['MidNumLet'] . '\']' . + '[' . $prop_classes['ALetter'] . $prop_classes['Hebrew_Letter'] . ']' . + '/u', + $substring_after, + ) + ) { + $chars[$i]['break_after'] = false; + $chars[++$i]['break_after'] = false; + continue; + } + + if ( + preg_match( + '/^[' . $prop_classes['Hebrew_Letter'] . ']\'/u', + $substring_after, + ) + ) { + $chars[$i]['break_after'] = false; + continue; + } + + if ( + preg_match( + '/^' . + '[' . $prop_classes['Hebrew_Letter'] . ']' . + '"' . + '[' . $prop_classes['Hebrew_Letter'] . ']' . + '/u', + $substring_after, + ) + ) { + $chars[$i]['break_after'] = false; + $chars[++$i]['break_after'] = false; + continue; + } + + // Do not break within sequences of digits, or digits adjacent to letters (“3a”, or “A3”). + if ( + preg_match( + '/^[' . $prop_classes['Numeric'] . ']{2}/u', + $substring_after, + ) + ) { + $chars[$i]['break_after'] = false; + continue; + } + + if ( + preg_match( + '/^[' . $prop_classes['ALetter'] . '][' . $prop_classes['Numeric'] . ']/u', + $substring_after, + ) + ) { + $chars[$i]['break_after'] = false; + continue; + } + + if ( + preg_match( + '/^' . + '[' . $prop_classes['Numeric'] . ']' . + '[' . $prop_classes['ALetter'] . ']' . + '/u', + $substring_after, + ) + ) { + $chars[$i]['break_after'] = false; + continue; + } + + // Do not break within sequences, such as “3.2” or “3,456.789”. + if ( + preg_match( + '/^' . + '[' . $prop_classes['Numeric'] . ']' . + '[' . $prop_classes['MidNum'] . $prop_classes['MidNumLet'] . '\']' . + '[' . $prop_classes['Numeric'] . ']' . + '/u', + $substring_after, + ) + ) { + $chars[$i]['break_after'] = false; + continue; + } + + if ( + preg_match( + '/[' . $prop_classes['Numeric'] . ']$/u', + $substring_before, + ) + && preg_match( + '/^' . + '[' . $prop_classes['MidNum'] . $prop_classes['MidNumLet'] . '\']' . + '[' . $prop_classes['Numeric'] . ']' . + '/u', + $substring_after, + ) + ) { + $chars[$i]['break_after'] = false; + continue; + } + + // Do not break between Katakana. + if ( + preg_match( + '/^[' . $prop_classes['Katakana'] . '][' . $prop_classes['Katakana'] . ']/u', + $substring_after, + ) + ) { + $chars[$i]['break_after'] = false; + continue; + } + + // Do not break from extenders. + if ( + preg_match( + '/^' . + '[' . $prop_classes['ALetter'] . $prop_classes['Hebrew_Letter'] . $prop_classes['Numeric'] . $prop_classes['Katakana'] . $prop_classes['ExtendNumLet'] . ']' . + '[' . $prop_classes['ExtendNumLet'] . ']' . + '/u', + $substring_after, + $matches, + ) + ) { + $chars[$i]['break_after'] = false; + continue; + } + + if ( + preg_match( + '/^' . + '[' . $prop_classes['ExtendNumLet'] . ']' . + '[' . $prop_classes['ALetter'] . $prop_classes['Hebrew_Letter'] . $prop_classes['Numeric'] . $prop_classes['Katakana'] . $prop_classes['ExtendNumLet'] . ']' . + '/u', + $substring_after, + $matches, + ) + ) { + $chars[$i]['break_after'] = false; + continue; + } + + // Do not break within emoji flag sequences. + if ( + preg_match( + '/^[' . $prop_classes['Regional_Indicator'] . ']/u', + $substring_after, + ) + && preg_match( + '/[' . $prop_classes['Regional_Indicator'] . ']*$/u', + $substring_before, + $matches, + ) + ) { + $chars[$i]['break_after'] = mb_strlen($matches[0]) % 2 === 1; + continue; + } + + // Otherwise, break everywhere (including around ideographs). + $chars[$i]['break_after'] = true; } - if (!empty($placeholders)) { - $word = strtr($word, array_flip($placeholders)); + // Build the list of words. + $words = []; + $word = ''; + + foreach ($chars as $char) { + $word .= $char['char']; + + if ($char['break_after']) { + $words[] = $word; + $word = ''; + } } + } - $words[$key] = $word; + foreach ($words as $key => $word) { + $word = Utils::htmlTrim($word); + + // Filter out punctuation marks, etc. + if (preg_replace('/[^\w' . $prop_classes['Regional_Indicator'] . $prop_classes['Emoji'] . $prop_classes['Emoji_Modifier'] . ']/u', '', $word) === '') { + unset($words[$key]); + } } // Restore the original version of the string.