400 views
# Just give me a test program OK. Name this program `test.c`: ```c #include <string.h> #include "sqlite3.h" const char* schema = "" "CREATE VIRTUAL TABLE search USING fts5(" " title," " prefix='1 2 3'," " content='')"; int main(int argc, char** argv) { sqlite3* db; /* Open a database connection */ if (sqlite3_open_v2(":memory:", &db, SQLITE_OPEN_READWRITE, 0) != SQLITE_OK) { return 1; }; /* Create the schema */ if (sqlite3_exec(db, schema, 0, 0, 0) != SQLITE_OK) { return 2; } /* Prepare an INSERT */ const char *sql = "INSERT INTO search (rowid, title) VALUES (?, ?)"; const char *tail; sqlite3_stmt *stmt; if (sqlite3_prepare_v2(db, sql, strlen(sql), &stmt, &tail) != SQLITE_OK) { sqlite3_close_v2(db); return 3; } /* Bind the rowid */ if (sqlite3_bind_int64(stmt, 1, 1000) != SQLITE_OK) { sqlite3_close_v2(db); sqlite3_finalize(stmt); return 4; } /* Bind the title */ const char *title = "まりや"; if (sqlite3_bind_text(stmt, 2, title, strlen(title), 0) != SQLITE_OK) { sqlite3_close_v2(db); sqlite3_finalize(stmt); return 5; } /* Run the insert */ while (sqlite3_step(stmt) != SQLITE_DONE) { } return 0; } ``` Link this against the SQLite amalgamation (go to https://www.sqlite.org/download.html and select the amalgamation option). Turn on ASan and FTS5: ``` gcc -g -fsanitize=address -DSQLITE_ENABLE_FTS5 -Wall test.c sqlite3.c -ldl -lpthread -lm ``` Then run it. 
You will see something like this: ``` ================================================================= ==3637==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x607000000b58 at pc 0x55f32bfd3d81 bp 0x7ffdc62f58f0 sp 0x7ffdc62f58e0 READ of size 1 at 0x607000000b58 thread T0 #0 0x55f32bfd3d80 in sqlite3Fts5IndexCharlenToBytelen /home/boom/vendor/sqlite3/sqlite3.c:203708 #1 0x55f32bfd3ff7 in sqlite3Fts5IndexWrite /home/boom/vendor/sqlite3/sqlite3.c:203760 #2 0x55f32bfea69e in fts5StorageInsertCallback /home/boom/vendor/sqlite3/sqlite3.c:208084 #3 0x55f32bff22f9 in fts5UnicodeTokenize /home/boom/vendor/sqlite3/sqlite3.c:209323 #4 0x55f32bf9d22b in sqlite3Fts5Tokenize /home/boom/vendor/sqlite3/sqlite3.c:194822 #5 0x55f32bfed454 in sqlite3Fts5StorageIndexInsert /home/boom/vendor/sqlite3/sqlite3.c:208437 #6 0x55f32bfe12b1 in fts5StorageInsert /home/boom/vendor/sqlite3/sqlite3.c:206408 #7 0x55f32bfe1aca in fts5UpdateMethod /home/boom/vendor/sqlite3/sqlite3.c:206511 #8 0x55f32be8b879 in sqlite3VdbeExec /home/boom/vendor/sqlite3/sqlite3.c:88868 #9 0x55f32be6a4b9 in sqlite3Step /home/boom/vendor/sqlite3/sqlite3.c:80302 #10 0x55f32be6abb3 in sqlite3_step /home/boom/vendor/sqlite3/sqlite3.c:80365 #11 0x55f32bdc1b8f in main /home/boom/vendor/sqlite3/test.c:50 #12 0x7fe80d8d6b96 in __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x21b96) #13 0x55f32bdc1719 in _start (/home/boom/vendor/sqlite3/a.out+0x23719) 0x607000000b58 is located 0 bytes to the right of 72-byte region [0x607000000b10,0x607000000b58) allocated by thread T0 here: #0 0x7fe80e545b50 in __interceptor_malloc (/usr/lib/x86_64-linux-gnu/libasan.so.4+0xdeb50) #1 0x55f32bdcc134 in sqlite3MemMalloc /home/boom/vendor/sqlite3/sqlite3.c:22262 #2 0x55f32bdcd018 in mallocWithAlarm /home/boom/vendor/sqlite3/sqlite3.c:26094 #3 0x55f32bdcd17c in sqlite3Malloc /home/boom/vendor/sqlite3/sqlite3.c:26124 #4 0x55f32bdcd269 in sqlite3_malloc /home/boom/vendor/sqlite3/sqlite3.c:26142 #5 0x55f32bff1306 in fts5UnicodeCreate 
/home/boom/vendor/sqlite3/sqlite3.c:209184 #6 0x55f32bfe82a6 in sqlite3Fts5GetTokenizer /home/boom/vendor/sqlite3/sqlite3.c:207530 #7 0x55f32bf9afe9 in fts5ConfigDefaultTokenizer /home/boom/vendor/sqlite3/sqlite3.c:194501 #8 0x55f32bf9c5ac in sqlite3Fts5ConfigParse /home/boom/vendor/sqlite3/sqlite3.c:194700 #9 0x55f32bfdadd6 in fts5InitVtab /home/boom/vendor/sqlite3/sqlite3.c:205324 #10 0x55f32bfdb463 in fts5CreateMethod /home/boom/vendor/sqlite3/sqlite3.c:205390 #11 0x55f32bf3c919 in vtabCallConstructor /home/boom/vendor/sqlite3/sqlite3.c:131129 #12 0x55f32bf3dc2c in sqlite3VtabCallCreate /home/boom/vendor/sqlite3/sqlite3.c:131299 #13 0x55f32be8a071 in sqlite3VdbeExec /home/boom/vendor/sqlite3/sqlite3.c:88544 #14 0x55f32be6a4b9 in sqlite3Step /home/boom/vendor/sqlite3/sqlite3.c:80302 #15 0x55f32be6abb3 in sqlite3_step /home/boom/vendor/sqlite3/sqlite3.c:80365 #16 0x55f32befdebc in sqlite3_exec /home/boom/vendor/sqlite3/sqlite3.c:115729 #17 0x55f32bdc193b in main /home/boom/vendor/sqlite3/test.c:20 #18 0x7fe80d8d6b96 in __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x21b96) SUMMARY: AddressSanitizer: heap-buffer-overflow /home/boom/vendor/sqlite3/sqlite3.c:203708 in sqlite3Fts5IndexCharlenToBytelen Shadow bytes around the buggy address: 0x0c0e7fff8110: fd fd fd fd fd fd fd fd fa fa fa fa fd fd fd fd 0x0c0e7fff8120: fd fd fd fd fd fd fa fa fa fa fd fd fd fd fd fd 0x0c0e7fff8130: fd fd fd fd fa fa fa fa fd fd fd fd fd fd fd fd 0x0c0e7fff8140: fd fd fa fa fa fa fd fd fd fd fd fd fd fd fd fd 0x0c0e7fff8150: fa fa fa fa 00 00 00 00 00 00 00 00 00 fa fa fa =>0x0c0e7fff8160: fa fa 00 00 00 00 00 00 00 00 00[fa]fa fa fa fa 0x0c0e7fff8170: 00 00 00 00 00 00 00 00 00 fa fa fa fa fa fd fd 0x0c0e7fff8180: fd fd fd fd fd fd fd fd fa fa fa fa fd fd fd fd 0x0c0e7fff8190: fd fd fd fd fd fa fa fa fa fa fd fd fd fd fd fd 0x0c0e7fff81a0: fd fd fd fd fa fa fa fa fd fd fd fd fd fd fd fd 0x0c0e7fff81b0: fd fd fa fa fa fa fd fd fd fd fd fd fd fd fd fd Shadow byte legend (one shadow 
byte represents 8 application bytes): Addressable: 00 Partially addressable: 01 02 03 04 05 06 07 Heap left redzone: fa Freed heap region: fd Stack left redzone: f1 Stack mid redzone: f2 Stack right redzone: f3 Stack after return: f5 Stack use after scope: f8 Global redzone: f9 Global init order: f6 Poisoned by user: f7 Container overflow: fc Array cookie: ac Intra object redzone: bb ASan internal: fe Left alloca redzone: ca Right alloca redzone: cb ==3637==ABORTING ``` # A possible explanation Here is a routine from SQLite 3.24.0's [FTS5](https://www.sqlite.org/fts5.html) that is involved in parsing text: ```c /* ** Argument p points to a buffer containing utf-8 text that is n bytes in ** size. Return the number of bytes in the nChar character prefix of the ** buffer, or 0 if there are less than nChar characters in total. */ static int sqlite3Fts5IndexCharlenToBytelen( const char *p, int nByte, int nChar ){ int n = 0; int i; for(i=0; i<nChar; i++){ if( n>=nByte ) return 0; /* Input contains fewer than nChar chars */ if( (unsigned char)p[n++]>=0xc0 ){ while( (p[n] & 0xc0)==0x80 ) n++; } } return n; } ``` This routine is used when building up prefixes for tokens, which are used to speed up searches for certain prefix lengths. For example, given the table ```sql CREATE VIRTUAL TABLE search USING fts5 ( title, prefix='1 2 3' ) ``` and the FTS5 input ```sql INSERT INTO search (rowid, title) VALUES (42, "foobar xyzzy") ``` FTS5 would generate the following 1-, 2-, and 3-character prefixes: * `f`, `fo`, and `foo` * `x`, `xy`, and `xyz` and store them in the search index. Let's feed in the UTF-8 sequence まりや (hiragana ma-ri-ya), which is represented by the bytes ``` E3 81 BE E3 82 8A E3 82 84 ``` and build 1-, 2-, and 3-character prefixes. The start of the UTF-8 byte sequence is pointed to by `p`. The byte sequence is 9 bytes long, so `nByte` is `9`. We will be generating 1-, 2-, and 3-character prefixes, so `nChar` will take on the values `1`, `2`, and `3`. 
Let's focus on what happens when `nChar` is `3`. Note that `p` is _not_ null-terminated. (This surprised me too.) ---- nByte = `9`, nChar = `3`, p points to `E3 81 BE E3 82 8A E3 82 84`. ```c int n = 0; int i; ``` n = `0`. ```c for(i=0; i<nChar; i++){ ``` i = `0`. ```c if( n>=nByte ) return 0; /* Input contains fewer than nChar chars */ ``` `0` >= `9` is false, so continue. ```c if( (unsigned char)p[n++]>=0xc0 ){ ``` Because `p` points to `E3 81 BE E3 82 8A E3 82 84`: `p[0]` == `0xE3`; `0xE3 >= 0xC0` is true, so execute the consequent. Also, `n` = `1`. ```c while( (p[n] & 0xc0)==0x80 ) n++; ``` `p[1]` == `0x81`; `(0x81 & 0xC0) == 0x80` is true, so increment `n`. `n` = `2`. `p[2]` == `0xBE`; `(0xBE & 0xC0) == 0x80` is true, so increment `n`. `n` = `3`. `p[3]` == `0xE3`; `(0xE3 & 0xC0) == 0x80` is false, so break out of the loop. ```c } } ``` `i = 1`. ```c if( n>=nByte ) return 0; /* Input contains fewer than nChar chars */ ``` `3 >= 9` is false, so continue. ```c if( (unsigned char)p[n++]>=0xc0 ){ ``` Because `p` points to `E3 81 BE E3 82 8A E3 82 84`: `p[3] == 0xE3`; `0xE3 >= 0xC0` is true, so execute the consequent. Also, `n = 4`. ```c while( (p[n] & 0xc0)==0x80 ) n++; ``` `p[4] == 0x82`; `(0x82 & 0xC0) == 0x80` is true, so increment `n`. `n = 5`. `p[5] == 0x8A`; `(0x8A & 0xC0) == 0x80` is true, so increment `n`. `n = 6`. `p[6] == 0xE3`; `(0xE3 & 0xC0) == 0x80` is false, so break out of the loop. ```c } } ``` `i = 2`. ```c if( n>=nByte ) return 0; /* Input contains fewer than nChar chars */ ``` `6 >= 9` is false, so continue. ```c if( (unsigned char)p[n++]>=0xc0 ){ ``` Because `p` points to `E3 81 BE E3 82 8A E3 82 84`: `p[6] == 0xE3`; `0xE3 >= 0xC0` is true, so execute the consequent. Also, `n = 7`. ```c while( (p[n] & 0xc0)==0x80 ) n++; ``` `p[7] == 0x82`; `(0x82 & 0xC0) == 0x80` is true, so increment `n`. `n = 8`. `p[8] == 0x84`; `(0x84 & 0xC0) == 0x80` is true, so increment `n`. `n = 9`. The loop now evaluates `p[9]`, but `p[9]` is outside the bounds of `p`! Uh-oh. The inner `while` loop never compares `n` against `nByte`, so when the buffer ends with a multi-byte character it reads one byte past the end of the 9-byte buffer.