diff --git a/icu4c/source/common/uniset_props.cpp b/icu4c/source/common/uniset_props.cpp index 6f9341d97fa4..175a96f7130b 100644 --- a/icu4c/source/common/uniset_props.cpp +++ b/icu4c/source/common/uniset_props.cpp @@ -693,6 +693,7 @@ class UnicodeSet::Lexer { std::optional queryOperatorPosition; int32_t queryExpressionStart = parsePosition_.getIndex(); bool exteriorlyNegated = false; + bool interiorlyNegated = false; UBool unusedEscaped; // Do not skip whitespace so we can recognize unspaced :]. Lex escapes and // named-element: while ICU does not support string-valued properties and thus has no @@ -742,7 +743,14 @@ class UnicodeSet::Lexer { // Neither a named-element nor an escaped-element can be part of a closing :]. lastUnescaped = -1; } else if (!queryOperatorPosition.has_value() && lastUnescaped == u'=') { - // TODO(egg): Propose and add support for ≠. + queryOperatorPosition = parsePosition_.getIndex() - 1; + } else if (!queryOperatorPosition.has_value() && lastUnescaped == u'≠') { + if (exteriorlyNegated) { + // Reject doubly negated property queries. + errorCode = U_ILLEGAL_ARGUMENT_ERROR; + return {}; + } + interiorlyNegated = true; queryOperatorPosition = parsePosition_.getIndex() - 1; } else if ((first == u'[' && penultimateUnescaped == u':' && lastUnescaped == u']') || (first == u'\\' && lastUnescaped == u'}')) { @@ -772,7 +780,7 @@ class UnicodeSet::Lexer { pattern_.tempSubStringBetween(queryExpressionStart, queryOperatorPosition.value_or(queryExpressionLimit)), propertyPredicate, errorCode); - if (exteriorlyNegated) { + if (exteriorlyNegated != interiorlyNegated) { result.complement().removeAllStrings(); } result.setPattern(pattern_.tempSubStringBetween(queryStart, parsePosition_.getIndex())); diff --git a/icu4c/source/test/intltest/usettest.cpp b/icu4c/source/test/intltest/usettest.cpp index 133070c62ca2..cd48d9216142 100644 --- a/icu4c/source/test/intltest/usettest.cpp +++ b/icu4c/source/test/intltest/usettest.cpp @@ -1118,6 +1118,34 @@ void UnicodeSetTest::TestPropertySet() { expectContainment(UnicodeString(DATA[i], -1, US_INV), CharsToUnicodeString(DATA[i+1]), CharsToUnicodeString(DATA[i+2])); } + { + UErrorCode status = U_ZERO_ERROR; + UnicodeSet s1(u"[:Noncharacter_Code_Point≠No:]", status); + UnicodeSet s2(u"[:Noncharacter_Code_Point:]", status); + TEST_ASSERT_SUCCESS(status); + TEST_ASSERT(s1 == s2); + } + { + UErrorCode status = U_ZERO_ERROR; + UnicodeSet s1(uR"(\p{Noncharacter_Code_Point≠No})", status); + UnicodeSet s2(uR"(\p{Noncharacter_Code_Point})", status); + TEST_ASSERT_SUCCESS(status); + TEST_ASSERT(s1 == s2); + } + { + UErrorCode status = U_ZERO_ERROR; + UnicodeSet s1(uR"(\p{dt≠can})", status); + UnicodeSet s2(uR"(\P{dt=can})", status); + TEST_ASSERT_SUCCESS(status); + TEST_ASSERT(s1 == s2); + } + { + UErrorCode status = U_ZERO_ERROR; + UnicodeSet s1(uR"([:dt≠can:])", status); + UnicodeSet s2(uR"([:^dt=can:])", status); + TEST_ASSERT_SUCCESS(status); + TEST_ASSERT(s1 == s2); + } } /** @@ -4778,6 +4806,9 @@ void UnicodeSetTest::TestParseErrors() { uR"(\p{Uppercase_Letter=})", // Well-formed in ICU 78 and earlier, disallowed by ICU-23306. uR"([: ^general category = punctuation :])", + // Doubly negated property queries. + uR"(\P{Decomposition_Type≠compat})", + u"[:^Noncharacter_Code_Point≠No:]", }) { UErrorCode errorCode = U_ZERO_ERROR; const UnicodeSet set(expression, errorCode);