@@ -1750,11 +1750,226 @@ pg_utf8_verifychar(const unsigned char *s, int len)
17501750 return l ;
17511751}
17521752
1753+ /*
1754+ * The fast path of the UTF-8 verifier uses a deterministic finite automaton
1755+ * (DFA) for multibyte characters. In a traditional table-driven DFA, the
1756+ * input byte and current state are used to compute an index into an array of
1757+ * state transitions. Since the address of the next transition is dependent
1758+ * on this computation, there is latency in executing the load instruction,
1759+ * and the CPU is not kept busy.
1760+ *
1761+ * Instead, we use a "shift-based" DFA as described by Per Vognsen:
1762+ *
1763+ * https://gist.github.com/pervognsen/218ea17743e1442e59bb60d29b1aa725
1764+ *
1765+ * In a shift-based DFA, the input byte is an index into an array of integers
1766+ * whose bit pattern encodes the state transitions. To compute the next
1767+ * state, we simply right-shift the integer by the current state and apply a
1768+ * mask. In this scheme, the address of the transition only depends on the
1769+ * input byte, so there is better pipelining.
1770+ *
1771+ * The naming convention for states and transitions was adopted from a UTF-8
1772+ * to UTF-16/32 transcoder, whose table is reproduced below:
1773+ *
1774+ * https://github.com/BobSteagall/utf_utils/blob/6b7a465265de2f5fa6133d653df0c9bdd73bbcf8/src/utf_utils.cpp
1775+ *
1776+ * ILL ASC CR1 CR2 CR3 L2A L3A L3B L3C L4A L4B L4C CLASS / STATE
1777+ * ==========================================================================
1778+ * err, END, err, err, err, CS1, P3A, CS2, P3B, P4A, CS3, P4B, | BGN/END
1779+ * err, err, err, err, err, err, err, err, err, err, err, err, | ERR
1780+ * |
1781+ * err, err, END, END, END, err, err, err, err, err, err, err, | CS1
1782+ * err, err, CS1, CS1, CS1, err, err, err, err, err, err, err, | CS2
1783+ * err, err, CS2, CS2, CS2, err, err, err, err, err, err, err, | CS3
1784+ * |
1785+ * err, err, err, err, CS1, err, err, err, err, err, err, err, | P3A
1786+ * err, err, CS1, CS1, err, err, err, err, err, err, err, err, | P3B
1787+ * |
1788+ * err, err, err, CS2, CS2, err, err, err, err, err, err, err, | P4A
1789+ * err, err, CS2, err, err, err, err, err, err, err, err, err, | P4B
1790+ *
1791+ * In the most straightforward implementation, a shift-based DFA for UTF-8
1792+ * requires 64-bit integers to encode the transitions, but with an SMT solver
1793+ * it's possible to find state numbers such that the transitions fit within
1794+ * 32-bit integers, as Dougall Johnson demonstrated:
1795+ *
1796+ * https://gist.github.com/dougallj/166e326de6ad4cf2c94be97a204c025f
1797+ *
1798+ * This packed representation is the reason for the seemingly odd choice of
1799+ * state values below.
1800+ */
1801+
/*
 * DFA states.
 *
 * A state's numeric value is also the bit offset at which its successor is
 * stored inside each transition word: the next state is obtained by
 * right-shifting the word by the current state and keeping five bits.  The
 * values are therefore not arbitrary; they are the ones that let every
 * transition word fit in 32 bits.
 */

/*
 * Error.  With a shift of zero the low five bits of the transition word are
 * read, and those are zero in every encoding below, so once entered the
 * error state persists.
 */
#define ERR 0
/* Begin */
#define BGN 11
/* Continuation states, expect 1/2/3 continuation bytes */
#define CS1 16
#define CS2 1
#define CS3 5
/*
 * Partial states: a lead byte was seen whose first continuation byte is
 * restricted to a sub-range of 80..BF.
 */
#define P3A 6					/* lead was E0: next must be A0..BF, then CS1 */
#define P3B 20					/* lead was ED: next must be 80..9F, then CS1 */
#define P4A 25					/* lead was F0: next must be 90..BF, then CS2 */
#define P4B 30					/* lead was F4: next must be 80..8F, then CS2 */
/* Begin and End are the same state */
#define END BGN

/*
 * The encoded state transitions for the lookup table.
 *
 * Each macro is the complete 32-bit transition word for one class of input
 * byte: "(NEXT << CUR)" stores state NEXT at the bit offset belonging to
 * state CUR.  Offsets that are not mentioned stay zero, i.e. lead to ERR.
 * Every expansion is fully parenthesized so that it can be used safely as an
 * operand in a larger expression.
 */

/* ASCII */
#define ASC (END << BGN)
/* 2-byte lead */
#define L2A (CS1 << BGN)
/* 3-byte lead */
#define L3A (P3A << BGN)
#define L3B (CS2 << BGN)
#define L3C (P3B << BGN)
/* 4-byte lead */
#define L4A (P4A << BGN)
#define L4B (CS3 << BGN)
#define L4C (P4B << BGN)
/* continuation byte: 80..8F, 90..9F and A0..BF respectively */
#define CR1 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4B))
#define CR2 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3B) | (CS2 << P4A))
#define CR3 ((END << CS1) | (CS1 << CS2) | (CS2 << CS3) | (CS1 << P3A) | (CS2 << P4A))
/* invalid byte */
#define ILL ERR
1839+
1840+ static const uint32 Utf8Transition [256 ] =
1841+ {
1842+ /* ASCII */
1843+
1844+ ILL , ASC , ASC , ASC , ASC , ASC , ASC , ASC ,
1845+ ASC , ASC , ASC , ASC , ASC , ASC , ASC , ASC ,
1846+ ASC , ASC , ASC , ASC , ASC , ASC , ASC , ASC ,
1847+ ASC , ASC , ASC , ASC , ASC , ASC , ASC , ASC ,
1848+
1849+ ASC , ASC , ASC , ASC , ASC , ASC , ASC , ASC ,
1850+ ASC , ASC , ASC , ASC , ASC , ASC , ASC , ASC ,
1851+ ASC , ASC , ASC , ASC , ASC , ASC , ASC , ASC ,
1852+ ASC , ASC , ASC , ASC , ASC , ASC , ASC , ASC ,
1853+
1854+ ASC , ASC , ASC , ASC , ASC , ASC , ASC , ASC ,
1855+ ASC , ASC , ASC , ASC , ASC , ASC , ASC , ASC ,
1856+ ASC , ASC , ASC , ASC , ASC , ASC , ASC , ASC ,
1857+ ASC , ASC , ASC , ASC , ASC , ASC , ASC , ASC ,
1858+
1859+ ASC , ASC , ASC , ASC , ASC , ASC , ASC , ASC ,
1860+ ASC , ASC , ASC , ASC , ASC , ASC , ASC , ASC ,
1861+ ASC , ASC , ASC , ASC , ASC , ASC , ASC , ASC ,
1862+ ASC , ASC , ASC , ASC , ASC , ASC , ASC , ASC ,
1863+
1864+ /* continuation bytes */
1865+
1866+ /* 80..8F */
1867+ CR1 , CR1 , CR1 , CR1 , CR1 , CR1 , CR1 , CR1 ,
1868+ CR1 , CR1 , CR1 , CR1 , CR1 , CR1 , CR1 , CR1 ,
1869+
1870+ /* 90..9F */
1871+ CR2 , CR2 , CR2 , CR2 , CR2 , CR2 , CR2 , CR2 ,
1872+ CR2 , CR2 , CR2 , CR2 , CR2 , CR2 , CR2 , CR2 ,
1873+
1874+ /* A0..BF */
1875+ CR3 , CR3 , CR3 , CR3 , CR3 , CR3 , CR3 , CR3 ,
1876+ CR3 , CR3 , CR3 , CR3 , CR3 , CR3 , CR3 , CR3 ,
1877+ CR3 , CR3 , CR3 , CR3 , CR3 , CR3 , CR3 , CR3 ,
1878+ CR3 , CR3 , CR3 , CR3 , CR3 , CR3 , CR3 , CR3 ,
1879+
1880+ /* leading bytes */
1881+
1882+ /* C0..DF */
1883+ ILL , ILL , L2A , L2A , L2A , L2A , L2A , L2A ,
1884+ L2A , L2A , L2A , L2A , L2A , L2A , L2A , L2A ,
1885+ L2A , L2A , L2A , L2A , L2A , L2A , L2A , L2A ,
1886+ L2A , L2A , L2A , L2A , L2A , L2A , L2A , L2A ,
1887+
1888+ /* E0..EF */
1889+ L3A , L3B , L3B , L3B , L3B , L3B , L3B , L3B ,
1890+ L3B , L3B , L3B , L3B , L3B , L3C , L3B , L3B ,
1891+
1892+ /* F0..FF */
1893+ L4A , L4B , L4B , L4B , L4C , ILL , ILL , ILL ,
1894+ ILL , ILL , ILL , ILL , ILL , ILL , ILL , ILL
1895+ };
1896+
1897+ static void
1898+ utf8_advance (const unsigned char * s , uint32 * state , int len )
1899+ {
1900+ /* Note: We deliberately don't check the state's value here. */
1901+ while (len > 0 )
1902+ {
1903+ /*
1904+ * It's important that the mask value is 31: In most instruction sets,
1905+ * a shift by a 32-bit operand is understood to be a shift by its mod
1906+ * 32, so the compiler should elide the mask operation.
1907+ */
1908+ * state = Utf8Transition [* s ++ ] >> (* state & 31 );
1909+ len -- ;
1910+ }
1911+
1912+ * state &= 31 ;
1913+ }
1914+
17531915static int
17541916pg_utf8_verifystr (const unsigned char * s , int len )
17551917{
17561918 const unsigned char * start = s ;
1919+ const int orig_len = len ;
1920+ uint32 state = BGN ;
1921+
1922+ /*
1923+ * Sixteen seems to give the best balance of performance across different
1924+ * byte distributions.
1925+ */
1926+ #define STRIDE_LENGTH 16
1927+
1928+ if (len >= STRIDE_LENGTH )
1929+ {
1930+ while (len >= STRIDE_LENGTH )
1931+ {
1932+ /*
1933+ * If the chunk is all ASCII, we can skip the full UTF-8 check,
1934+ * but we must first check for a non-END state, which means the
1935+ * previous chunk ended in the middle of a multibyte sequence.
1936+ */
1937+ if (state != END || !is_valid_ascii (s , STRIDE_LENGTH ))
1938+ utf8_advance (s , & state , STRIDE_LENGTH );
1939+
1940+ s += STRIDE_LENGTH ;
1941+ len -= STRIDE_LENGTH ;
1942+ }
1943+
1944+ /*
1945+ * The error state persists, so we only need to check for it here. In
1946+ * case of error we start over from the beginning with the slow path
1947+ * so we can count the valid bytes.
1948+ */
1949+ if (state == ERR )
1950+ {
1951+ len = orig_len ;
1952+ s = start ;
1953+ }
1954+
1955+ /*
1956+ * We treat all other states as success, but it's possible the fast
1957+ * path exited in the middle of a multibyte sequence, since that
1958+ * wouldn't have caused an error. Before checking the remaining bytes,
1959+ * walk backwards to find the last byte that could have been the start
1960+ * of a valid sequence.
1961+ */
1962+ while (s > start )
1963+ {
1964+ s -- ;
1965+ len ++ ;
1966+
1967+ if (!IS_HIGHBIT_SET (* s ) || pg_utf_mblen (s ) > 1 )
1968+ break ;
1969+ }
1970+ }
17571971
1972+ /* check remaining bytes */
17581973 while (len > 0 )
17591974 {
17601975 int l ;
0 commit comments