Skip to content

Commit b783686

Browse files
committed
Optimizes string matching by allowing memcmp-like functionality (even on UTF-8 sequences)
reference: hanickadot#147 comparison: https://compiler-explorer.com/z/Tz3KhG
1 parent 9a37e55 commit b783686

File tree

2 files changed

+85
-4
lines changed

2 files changed

+85
-4
lines changed

include/ctre/evaluation.hpp

+47-4
Original file line numberDiff line numberDiff line change
@@ -115,11 +115,54 @@ template <typename CharT, typename Iterator, typename EndIterator> constexpr CTR
115115
return false;
116116
}
117117

118-
template <auto... String, size_t... Idx, typename Iterator, typename EndIterator> constexpr CTRE_FORCE_INLINE string_match_result<Iterator> evaluate_match_string(Iterator current, [[maybe_unused]] const EndIterator end, std::index_sequence<Idx...>) noexcept {
119-
120-
bool same = (compare_character(String, current, end) && ... && true);
118+
#if __cpp_char8_t >= 201811
// Compares the raw code units at `current.ptr` against an already UTF-8 encoded
// pattern held in `buffer`, memcmp-style.
//
//   current - utf8_iterator positioned at the start of the candidate text
//   end     - either another utf8_iterator or utf8_iterator::sentinel
//   buffer  - the N code units of the pattern
//   Idx...  - indices into `buffer`; the caller passes 0..N-1
//
// Returns the iterator advanced by min(remaining, N) code units together with a
// flag that is true only when at least N code units remain and every indexed
// unit equals the corresponding unit of `buffer`.
template <size_t N, size_t... Idx, typename Iterator, typename EndIterator> constexpr CTRE_FORCE_INLINE string_match_result<Iterator> evaluate_match_utf8_string(Iterator current, [[maybe_unused]] const EndIterator end, char8_t (&buffer)[N], std::index_sequence<Idx...>) noexcept {
	// abuse inside knowledge of how utf8_iterator works: it exposes the raw
	// code unit pointers `ptr` and `end`
	size_t count = 0;
	if constexpr (std::is_same_v<::std::remove_const_t<EndIterator>, utf8_iterator::sentinel>) {
		// a sentinel carries no pointer, the iterator itself knows where the input stops
		count = static_cast<size_t>(current.end - current.ptr);
	} else {
		count = static_cast<size_t>(end.ptr - current.ptr);
	}
	// never move the iterator past the available input
	const size_t bump = (count < N) ? count : N;
	// the length check has to short-circuit so nothing past the input is read;
	// the per-unit mismatches are OR-ed together instead of branching on each one
	const bool same = (count >= N) && (((static_cast<char8_t>(current.ptr[Idx] != buffer[Idx])) | ... | char8_t{0}) == 0);
	return { Iterator{current.ptr + bump, current.end}, same };
}
#endif
121132

122-
return {current, same};
133+
// Matches the literal character sequence String... against the input starting
// at `current`.
//
// Three strategies, chosen at compile time:
//   1. utf8_iterator input (only when char8_t is available): the pattern is
//      encoded to UTF-8 once and compared code unit by code unit.
//   2. random-access iterators where begin and end share a type: one length
//      check followed by a branch-free OR of the per-character mismatches.
//   3. anything else: character by character through compare_character.
//
// Returns the resulting iterator position together with the match flag. In
// strategies 1 and 2 the iterator is advanced by at most the pattern length
// and never past `end`.
template <auto... String, size_t... Idx, typename Iterator, typename EndIterator> constexpr CTRE_FORCE_INLINE string_match_result<Iterator> evaluate_match_string(Iterator current, [[maybe_unused]] const EndIterator end, std::index_sequence<Idx...>) noexcept {
#if __cpp_char8_t >= 201811
	if constexpr (sizeof...(String) && std::is_same_v<::std::remove_const_t<Iterator>, utf8_iterator> && (std::is_same_v<std::remove_const_t<Iterator>, std::remove_const_t<EndIterator>> || std::is_same_v<::std::remove_const_t<EndIterator>, utf8_iterator::sentinel>)) {
		constexpr size_t str_length = (utf8_codepoint_length(String) + ... + 0ULL);
		// encode our String... into its utf8 representation
		// (value-initialized: an uninitialized local is not allowed in a
		// constexpr function on every compiler that already has char8_t)
		char8_t utf8_sequence[str_length]{};
		char8_t* ptr = utf8_sequence;
		((ptr = utf32_codepoint_to_utf8_codepoint(String, ptr)), ...);
		// run the comparison
		return evaluate_match_utf8_string(current, end, utf8_sequence, std::make_index_sequence<str_length>());
	} else
#endif
	if constexpr (sizeof...(String) && is_random_accessible(typename std::iterator_traits<Iterator>::iterator_category{}) && std::is_same_v<std::remove_const_t<Iterator>, std::remove_const_t<EndIterator>>) {
		// strip the reference first, otherwise remove_cv has nothing to remove
		using char_type = ::std::remove_cv_t<::std::remove_reference_t<decltype(*current)>>;
		// number of characters left in the input
		const size_t count = static_cast<size_t>(end - current);
		// make sure we only "bump" the iterator a safe distance
		const size_t bump = (count < sizeof...(String)) ? count : sizeof...(String);
		// the length check short-circuits so nothing past `end` is read; the
		// mismatches are OR-ed together to avoid one branch per character
		return { current + bump, (count >= sizeof...(String)) && (((static_cast<char_type>(current[Idx] != static_cast<char_type>(String))) | ... | static_cast<char_type>(0)) == 0) };
	} else {
		bool same = (compare_character(String, current, end) && ... && true);
		return { current, same };
	}
}
124167

125168
template <typename R, typename Iterator, typename EndIterator, auto... String, typename... Tail>

include/ctre/utf8.hpp

+38
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,44 @@
88
#include <iterator>
99

1010
namespace ctre {
11+
// Writes the UTF-8 encoding of `code` to the buffer starting at `ptr` and
// returns a pointer one past the last code unit written (1 to 4 units).
//
// Values of 0x200000 and above cannot be expressed in four units; for those a
// single 0xff byte (never a valid UTF-8 start byte) is written instead.
// The caller must provide room for the code units written.
constexpr char8_t* utf32_codepoint_to_utf8_codepoint(uint32_t code, char8_t *ptr) {
	if (code < 0x80) { // 0xxxxxxx
		ptr[0] = static_cast<char8_t>(code);
		return ptr + 1;
	} else if (code < 0x800) { // 00000yyy yyxxxxxx
		ptr[0] = static_cast<char8_t>(0b11000000 | (code >> 6));           // 110yyyyy
		ptr[1] = static_cast<char8_t>(0b10000000 | (code & 0x3f));         // 10xxxxxx
		return ptr + 2;
	} else if (code < 0x10000) { // zzzzyyyy yyxxxxxx
		ptr[0] = static_cast<char8_t>(0b11100000 | (code >> 12));          // 1110zzzz
		ptr[1] = static_cast<char8_t>(0b10000000 | ((code >> 6) & 0x3f));  // 10yyyyyy
		ptr[2] = static_cast<char8_t>(0b10000000 | (code & 0x3f));         // 10xxxxxx
		return ptr + 3;
	} else if (code < 0x200000) { // 000uuuuu zzzzyyyy yyxxxxxx
		ptr[0] = static_cast<char8_t>(0b11110000 | (code >> 18));          // 11110uuu
		ptr[1] = static_cast<char8_t>(0b10000000 | ((code >> 12) & 0x3f)); // 10uuzzzz
		ptr[2] = static_cast<char8_t>(0b10000000 | ((code >> 6) & 0x3f));  // 10yyyyyy
		ptr[3] = static_cast<char8_t>(0b10000000 | (code & 0x3f));         // 10xxxxxx
		return ptr + 4;
	} else {
		ptr[0] = static_cast<char8_t>(0xff); // invalid start byte
		return ptr + 1;
	}
}
35+
36+
// Number of UTF-8 code units used to store `code`:
//   below 0x80      -> 1
//   below 0x800     -> 2
//   below 0x10000   -> 3
//   below 0x200000  -> 4
//   anything larger -> 1
constexpr uint32_t utf8_codepoint_length(uint32_t code) {
	// checked from the widest range downwards
	if (code >= 0x200000) return 1; // not representable in four units, occupies a single slot
	if (code >= 0x10000) return 4;  // 000uuuuu zzzzyyyy yyxxxxxx
	if (code >= 0x800) return 3;    // zzzzyyyy yyxxxxxx
	if (code >= 0x80) return 2;     // 00000yyy yyxxxxxx
	return 1;
}
1149

1250
struct utf8_iterator {
1351
using self_type = utf8_iterator;

0 commit comments

Comments
 (0)