# Just give me a test program
OK. Name this program `test.c`:
```c
#include <string.h>
#include "sqlite3.h"
/* DDL for the FTS5 table under test: one column, prefix option '1 2 3',
** and an empty content option. */
const char *schema =
    "CREATE VIRTUAL TABLE search USING fts5("
    " title,"
    " prefix='1 2 3',"
    " content='')";
int main(int argc, char** argv)
{
sqlite3* db;
/* Open a database connection */
if (sqlite3_open_v2(":memory:", &db, SQLITE_OPEN_READWRITE, 0) != SQLITE_OK) {
return 1;
};
/* Create the schema */
if (sqlite3_exec(db, schema, 0, 0, 0) != SQLITE_OK) {
return 2;
}
/* Prepare an INSERT */
const char *sql = "INSERT INTO search (rowid, title) VALUES (?, ?)";
const char *tail;
sqlite3_stmt *stmt;
if (sqlite3_prepare_v2(db, sql, strlen(sql), &stmt, &tail) != SQLITE_OK) {
sqlite3_close_v2(db);
return 3;
}
/* Bind the rowid */
if (sqlite3_bind_int64(stmt, 1, 1000) != SQLITE_OK) {
sqlite3_close_v2(db);
sqlite3_finalize(stmt);
return 4;
}
/* Bind the title */
const char *title = "まりや";
if (sqlite3_bind_text(stmt, 2, title, strlen(title), 0) != SQLITE_OK) {
sqlite3_close_v2(db);
sqlite3_finalize(stmt);
return 5;
}
/* Run the insert */
while (sqlite3_step(stmt) != SQLITE_DONE) {
}
return 0;
}
```
Link this against the SQLite amalgamation (go to https://www.sqlite.org/download.html and select the amalgamation option). Turn on ASan and FTS5:
```
gcc -g -fsanitize=address -DSQLITE_ENABLE_FTS5 -Wall test.c sqlite3.c -ldl -lpthread -lm
```
Then run it. You will see something like this:
```
=================================================================
==3637==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x607000000b58 at pc 0x55f32bfd3d81 bp 0x7ffdc62f58f0 sp 0x7ffdc62f58e0
READ of size 1 at 0x607000000b58 thread T0
#0 0x55f32bfd3d80 in sqlite3Fts5IndexCharlenToBytelen /home/boom/vendor/sqlite3/sqlite3.c:203708
#1 0x55f32bfd3ff7 in sqlite3Fts5IndexWrite /home/boom/vendor/sqlite3/sqlite3.c:203760
#2 0x55f32bfea69e in fts5StorageInsertCallback /home/boom/vendor/sqlite3/sqlite3.c:208084
#3 0x55f32bff22f9 in fts5UnicodeTokenize /home/boom/vendor/sqlite3/sqlite3.c:209323
#4 0x55f32bf9d22b in sqlite3Fts5Tokenize /home/boom/vendor/sqlite3/sqlite3.c:194822
#5 0x55f32bfed454 in sqlite3Fts5StorageIndexInsert /home/boom/vendor/sqlite3/sqlite3.c:208437
#6 0x55f32bfe12b1 in fts5StorageInsert /home/boom/vendor/sqlite3/sqlite3.c:206408
#7 0x55f32bfe1aca in fts5UpdateMethod /home/boom/vendor/sqlite3/sqlite3.c:206511
#8 0x55f32be8b879 in sqlite3VdbeExec /home/boom/vendor/sqlite3/sqlite3.c:88868
#9 0x55f32be6a4b9 in sqlite3Step /home/boom/vendor/sqlite3/sqlite3.c:80302
#10 0x55f32be6abb3 in sqlite3_step /home/boom/vendor/sqlite3/sqlite3.c:80365
#11 0x55f32bdc1b8f in main /home/boom/vendor/sqlite3/test.c:50
#12 0x7fe80d8d6b96 in __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x21b96)
#13 0x55f32bdc1719 in _start (/home/boom/vendor/sqlite3/a.out+0x23719)
0x607000000b58 is located 0 bytes to the right of 72-byte region [0x607000000b10,0x607000000b58)
allocated by thread T0 here:
#0 0x7fe80e545b50 in __interceptor_malloc (/usr/lib/x86_64-linux-gnu/libasan.so.4+0xdeb50)
#1 0x55f32bdcc134 in sqlite3MemMalloc /home/boom/vendor/sqlite3/sqlite3.c:22262
#2 0x55f32bdcd018 in mallocWithAlarm /home/boom/vendor/sqlite3/sqlite3.c:26094
#3 0x55f32bdcd17c in sqlite3Malloc /home/boom/vendor/sqlite3/sqlite3.c:26124
#4 0x55f32bdcd269 in sqlite3_malloc /home/boom/vendor/sqlite3/sqlite3.c:26142
#5 0x55f32bff1306 in fts5UnicodeCreate /home/boom/vendor/sqlite3/sqlite3.c:209184
#6 0x55f32bfe82a6 in sqlite3Fts5GetTokenizer /home/boom/vendor/sqlite3/sqlite3.c:207530
#7 0x55f32bf9afe9 in fts5ConfigDefaultTokenizer /home/boom/vendor/sqlite3/sqlite3.c:194501
#8 0x55f32bf9c5ac in sqlite3Fts5ConfigParse /home/boom/vendor/sqlite3/sqlite3.c:194700
#9 0x55f32bfdadd6 in fts5InitVtab /home/boom/vendor/sqlite3/sqlite3.c:205324
#10 0x55f32bfdb463 in fts5CreateMethod /home/boom/vendor/sqlite3/sqlite3.c:205390
#11 0x55f32bf3c919 in vtabCallConstructor /home/boom/vendor/sqlite3/sqlite3.c:131129
#12 0x55f32bf3dc2c in sqlite3VtabCallCreate /home/boom/vendor/sqlite3/sqlite3.c:131299
#13 0x55f32be8a071 in sqlite3VdbeExec /home/boom/vendor/sqlite3/sqlite3.c:88544
#14 0x55f32be6a4b9 in sqlite3Step /home/boom/vendor/sqlite3/sqlite3.c:80302
#15 0x55f32be6abb3 in sqlite3_step /home/boom/vendor/sqlite3/sqlite3.c:80365
#16 0x55f32befdebc in sqlite3_exec /home/boom/vendor/sqlite3/sqlite3.c:115729
#17 0x55f32bdc193b in main /home/boom/vendor/sqlite3/test.c:20
#18 0x7fe80d8d6b96 in __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x21b96)
SUMMARY: AddressSanitizer: heap-buffer-overflow /home/boom/vendor/sqlite3/sqlite3.c:203708 in sqlite3Fts5IndexCharlenToBytelen
Shadow bytes around the buggy address:
0x0c0e7fff8110: fd fd fd fd fd fd fd fd fa fa fa fa fd fd fd fd
0x0c0e7fff8120: fd fd fd fd fd fd fa fa fa fa fd fd fd fd fd fd
0x0c0e7fff8130: fd fd fd fd fa fa fa fa fd fd fd fd fd fd fd fd
0x0c0e7fff8140: fd fd fa fa fa fa fd fd fd fd fd fd fd fd fd fd
0x0c0e7fff8150: fa fa fa fa 00 00 00 00 00 00 00 00 00 fa fa fa
=>0x0c0e7fff8160: fa fa 00 00 00 00 00 00 00 00 00[fa]fa fa fa fa
0x0c0e7fff8170: 00 00 00 00 00 00 00 00 00 fa fa fa fa fa fd fd
0x0c0e7fff8180: fd fd fd fd fd fd fd fd fa fa fa fa fd fd fd fd
0x0c0e7fff8190: fd fd fd fd fd fa fa fa fa fa fd fd fd fd fd fd
0x0c0e7fff81a0: fd fd fd fd fa fa fa fa fd fd fd fd fd fd fd fd
0x0c0e7fff81b0: fd fd fa fa fa fa fd fd fd fd fd fd fd fd fd fd
Shadow byte legend (one shadow byte represents 8 application bytes):
Addressable: 00
Partially addressable: 01 02 03 04 05 06 07
Heap left redzone: fa
Freed heap region: fd
Stack left redzone: f1
Stack mid redzone: f2
Stack right redzone: f3
Stack after return: f5
Stack use after scope: f8
Global redzone: f9
Global init order: f6
Poisoned by user: f7
Container overflow: fc
Array cookie: ac
Intra object redzone: bb
ASan internal: fe
Left alloca redzone: ca
Right alloca redzone: cb
==3637==ABORTING
```
# A possible explanation
Here is a routine from SQLite 3.24.0's [FTS5](https://www.sqlite.org/fts5.html) that is involved in parsing text:
```c
/*
** Argument p points to a buffer containing utf-8 text that is n bytes in
** size. Return the number of bytes in the nChar character prefix of the
** buffer, or 0 if there are less than nChar characters in total.
**
** NOTE(review) - the comments below this line are annotations added to the
** quoted routine, not part of the SQLite source:
**   p      start of the text; it is read one byte at a time as p[n]
**   nByte  size of the buffer in bytes (the "n bytes" mentioned above)
**   nChar  number of leading characters whose total byte length is wanted
*/
static int sqlite3Fts5IndexCharlenToBytelen(
const char *p,
int nByte,
int nChar
){
/* n: byte offset of the next unread byte; i: characters consumed so far */
int n = 0;
int i;
for(i=0; i<nChar; i++){
if( n>=nByte ) return 0; /* Input contains fewer than nChar chars */
/* A byte >= 0xc0 is treated as the first byte of a multi-byte character */
if( (unsigned char)p[n++]>=0xc0 ){
/* Skip bytes of the form 10xxxxxx. NOTE(review): p[n] is read here
** without first comparing n against nByte. */
while( (p[n] & 0xc0)==0x80 ) n++;
}
}
return n;
}
```
This routine is used when FTS5 builds prefix indexes for tokens; those indexes speed up prefix queries whose prefix has one of the configured lengths. For example, given the table
```sql
CREATE VIRTUAL TABLE search USING fts5 (
title,
prefix='1 2 3'
)
```
and the FTS5 input
```sql
INSERT INTO search (rowid, title) VALUES (42, "foobar xyzzy")
```
FTS5 would generate the following 1-, 2-, and 3-character prefixes:
* `f`, `fo`, and `foo`
* `x`, `xy`, and `xyz`
and store them in the search index.
Let's feed in the UTF-8 sequence まりや (hiragana ma-ri-ya), which is represented by the bytes
```
E3 81 BE E3 82 8A E3 82 84
```
and build 1-, 2-, and 3-character prefixes.
The start of the UTF-8 byte sequence is pointed to by `p`. The byte sequence is 9 bytes long, so `nByte` is `9`. We will be generating 1-, 2-, and 3-character prefixes, so `nChar` will take on the values `1`, `2`, and `3`. Let's focus on what happens when `nChar` is `3`.
Note that `p` is _not_ null-terminated. (This surprised me too.)
----
nByte = `9`, nChar = `3`, p points to `E3 81 BE E3 82 8A E3 82 84`.
```c
int n = 0;
int i;
```
n = `0`.
```c
for(i=0; i<nChar; i++){
```
i = `0`.
```c
if( n>=nByte ) return 0; /* Input contains fewer than nChar chars */
```
`0` >= `9` is false, so continue.
```c
if( (unsigned char)p[n++]>=0xc0 ){
```
Because `p` points to `E3 81 BE E3 82 8A E3 82 84`:
`p[0]` == `0xE3`; `0xE3 >= 0xC0` is true, so execute the consequent. Also, `n` = `1`.
```c
while( (p[n] & 0xc0)==0x80 ) n++;
```
`p[1]` == `0x81`; `(0x81 & 0xC0) == 0x80` is true, so increment `n`. `n` = `2`.
`p[2]` == `0xBE`; `(0xBE & 0xC0) == 0x80` is true, so increment `n`. `n` = `3`.
`p[3]` == `0xE3`; `(0xE3 & 0xC0) == 0x80` is false, so break out of the loop.
```c
}
}
```
`i = 1`.
```c
if( n>=nByte ) return 0; /* Input contains fewer than nChar chars */
```
`3 >= 9` is false, so continue.
```c
if( (unsigned char)p[n++]>=0xc0 ){
```
Because `p` points to `E3 81 BE E3 82 8A E3 82 84`:
`p[3] == 0xE3`; `0xE3 >= 0xC0` is true, so execute the consequent. Also, `n = 4`.
```c
while( (p[n] & 0xc0)==0x80 ) n++;
```
`p[4] = 0x82`; `(0x82 & 0xC0) == 0x80` is true, so increment `n`. `n = 5`.
`p[5] = 0x8A`; `(0x8A & 0xC0) == 0x80` is true, so increment `n`. `n = 6`.
`p[6] = 0xE3`; `(0xE3 & 0xC0) == 0x80` is false, so break out of the loop.
```c
}
}
```
`i = 2`.
```c
if( n>=nByte ) return 0; /* Input contains fewer than nChar chars */
```
`6 >= 9` is false, so continue.
```c
if( (unsigned char)p[n++]>=0xc0 ){
```
Because `p` points to `E3 81 BE E3 82 8A E3 82 84`:
`p[6] = 0xE3`; `0xE3 >= 0xC0` is true, so execute the consequent. Also, `n = 7`.
```c
while( (p[n] & 0xc0)==0x80 ) n++;
```
`p[7] = 0x82`; `(0x82 & 0xC0) == 0x80` is true, so increment `n`. `n = 8`.
`p[8] = 0x84`; `(0x84 & 0xC0) == 0x80` is true, so increment `n`. `n = 9`.
`p[9]` is outside the bounds of `p`! Uh-oh.