mirror of https://github.com/phiresky/sql.js-httpvfs.git synced 2024-11-08 20:22:44 +01:00

readme etc

phiresky 2021-04-27 15:48:20 +02:00
parent 5020e1a3ed
commit 30a4d35d58
6 changed files with 309 additions and 118 deletions

README.md

@@ -1,18 +1,78 @@
# youtube-sponsorship-stats
# sql.js-httpvfs
![screenshot](screenshot.png)
sql.js is a light wrapper around SQLite compiled with Emscripten for use in the browser (client-side).
This tool uses the [SponsorBlock](https://sponsor.ajay.app/) database to show a chart of how much time sponsorships take up in the videos of a specific YouTuber.
This repo is a fork of and wrapper around sql.js to provide a read-only HTTP-Range-request based virtual file system for SQLite. It allows hosting an SQLite database on a static file hoster and querying that database from the browser without fully downloading it.
The data is stored in an SQLite database (150MB) and hosted on GitHub Pages. The SQLite engine is compiled as WASM and run directly in the browser. The database pages are fetched on demand from the remote URL by using a virtual filesystem that delegates to AJAX requests. This means that to get the data of one uploader, only around 300kB of data need to be fetched, not the whole database of 150MByte.
Note that this only works well if your database and indices are structured well.
## Wait what? You are using GitHub Pages as a database engine? Also, I thought GitHub had a file size limit of 100MB?
Yes
It also provides a proof-of-concept level implementation of a DOM virtual table that allows interacting (read/write) with the browser DOM directly from within SQLite queries.
## Building
## Usage
1. Compile sqlite wasm: `cd sql.js && yarn && yarn build -j8 EMCC=/usr/lib/emscripten/emcc`
2. Create the database: `./create_db.sh` (note that you will need the videoData table which is not present in the normal sponsorblock dumps)
3. Build the website: `yarn dev` or `yarn build`
(optional) First, improve your SQLite database:
```sql
-- first, add whatever indices you need. Note that here having many and correct indices is even more important than for a normal database.
pragma journal_mode = delete; -- to be able to actually set page size
pragma page_size = 1024; -- trade-off between the number of requests that need to be made and the per-request overhead.
vacuum; -- reorganize database and apply changed page size
```
(optional) Second, split the database into chunks and generate a JSON config using the [create_db.sh](create_db.sh) script. This is needed if your hoster has a maximum file size. It can also be a good idea in general, depending on your CDN, since it allows selective caching of the chunks your users actually use and reduces cache eviction.
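For orientation, the generated config just describes where the chunks live and how large they are. Here is a sketch of the equivalent inline configuration (the concrete numbers and the `/foo/bar/` path are placeholders, not real values):

```ts
// sketch of a "chunked" config like the one create_db.sh writes (values are placeholders)
const chunkedConfig = {
  from: "inline",
  config: {
    serverMode: "chunked", // the database is split into several files on the server
    requestChunkSize: 1024, // must match `pragma page_size` of the database
    databaseLengthBytes: 157286400, // size of the original sqlite3 file (hypothetical)
    serverChunkSize: 10 * 1024 * 1024, // size of each chunk file (10 MiB as in create_db.sh)
    urlPrefix: "/foo/bar/db.sqlite3.", // chunks are fetched as db.sqlite3.000, db.sqlite3.001, ...
  },
};
```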
Finally, use it in TypeScript / JS!
```ts
import { createDbWorker } from "sql.js-httpvfs";

// sadly there's no good way to package workers and wasm directly, so you need a way to get these two URLs from your bundler. The below is the webpack5 way:
const workerUrl = new URL(
  "sql.js-httpvfs/dist/sqlite.worker.js",
  import.meta.url,
);
const wasmUrl = new URL(
  "sql.js-httpvfs/dist/sql-wasm.wasm",
  import.meta.url,
);
// the legacy webpack4 way is something like `import wasmUrl from "file-loader!sql.js-httpvfs/dist/sql-wasm.wasm"`.

// the config is either the URL of a JSON config generated by the create_db.sh script, or an inline configuration:
const config = {
  from: "inline",
  config: {
    serverMode: "full", // the file is just a plain old full sqlite database
    requestChunkSize: 4096, // the page size of the sqlite database (by default 4096)
    url: "/foo/bar/test.sqlite3" // url to the database (relative or full)
  }
};
// or:
const config = {
  from: "jsonconfig",
  configUrl: "/foo/bar/config.json"
};

const worker = await createDbWorker(
  [config],
  workerUrl.toString(), wasmUrl.toString()
);
// you can also pass multiple config objects, which can then be used as separate database schemas with `ATTACH virtualFilename as schemaname`, where virtualFilename is also set in the config object (see the sketch below).

// worker.db is now a SQL.js instance, except that all functions return Promises.
const result = await worker.db.exec(`select * from mytable where id = ?`, [123]);
```
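To illustrate the multi-database case mentioned in the comment above, here is a rough sketch (the `virtualFilename`s, URLs and table name are placeholders; it reuses `workerUrl` and `wasmUrl` from the snippet above):

```ts
// sketch: serving two databases from one worker and attaching the second one
const multiWorker = await createDbWorker(
  [
    {
      from: "inline",
      virtualFilename: "main.sqlite3",
      config: { serverMode: "full", requestChunkSize: 4096, url: "/data/main.sqlite3" },
    },
    {
      from: "inline",
      virtualFilename: "extra.sqlite3",
      config: { serverMode: "full", requestChunkSize: 4096, url: "/data/extra.sqlite3" },
    },
  ],
  workerUrl.toString(),
  wasmUrl.toString()
);
// the first config becomes the main database; the others can be attached as extra schemas
await multiWorker.db.exec(`ATTACH DATABASE 'extra.sqlite3' AS extra`);
const rows = await multiWorker.db.exec(`select * from extra.some_table limit 10`);
```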
## Inspiration
This project is inspired by:
* https://github.com/lmatteis/torrent-net https://github.com/bittorrent/sqltorrent Torrent VFS for SQLite. In theory even more awesome than a httpvfs, but only works with native SQLite not in the browser (needs extension to use WebTorrent).
* https://phiresky.github.io/tv-show-ratings/ a project of mine that fetches the backing data from a WebTorrent (and afterwards seeds it). Not SQLite though, just a torrent with a set of hashed file chunks.
* https://phiresky.github.io/youtube-sponsorship-stats/?uploader=Adam+Ragusea what I originally built sql.js-httpvfs for

create_db.sh

@@ -3,14 +3,21 @@ set -eu
indb="$1"
outdir="$2"
# for chunked mode, we need to know the database size in bytes beforehand
bytes="$(stat --printf="%s" "$indb")"
serverChunkSize=$((50 * 1024 * 1024))
# set chunk size to 10MiB (needs to be a multiple of the `pragma page_size`!)
serverChunkSize=$((10 * 1024 * 1024))
suffixLength=3
rm -f "$outdir/db.sqlite3"*
split "$indb" --bytes=$serverChunkSize "$outdir/db.sqlite3." --suffix-length=$suffixLength --numeric-suffixes
# set request chunk size to match page size
requestChunkSize="$(sqlite3 "$indb" 'pragma page_size')"
# write a json config
echo '
{
"serverMode": "chunked",
"requestChunkSize": '$requestChunkSize',
"databaseLengthBytes": '$bytes',
"serverChunkSize": '$serverChunkSize',

db.ts

@@ -23,42 +23,26 @@ Comlink.transferHandlers.set("WORKERSQLPROXIES", {
},
});
export type SqliteWorker = Comlink.Remote<SqliteComlinkMod>;
export interface WorkerHttpvfsDatabase
extends Comlink.Remote<LazyHttpDatabase> {
export interface WorkerHttpvfs {
db: Comlink.Remote<LazyHttpDatabase>;
worker: Comlink.Remote<SqliteComlinkMod>;
config: SplitFileConfig;
configs: SplitFileConfig[];
}
export async function createDbWorker(
databaseConfigUrl: string,
configs: SplitFileConfig[],
workerUrl: string,
wasmUrl: string,
): Promise<WorkerHttpvfsDatabase> {
wasmUrl: string
): Promise<WorkerHttpvfs> {
const worker: Worker = new Worker(workerUrl);
const sqlite = Comlink.wrap<SqliteComlinkMod>(worker);
const configUrl = new URL(databaseConfigUrl, location.href);
const req = await fetch(configUrl.toString());
if (!req.ok)
throw Error(
`Could not load httpvfs config: ${req.status}: ${await req.text()}`
);
const config: SplitFileConfig = await req.json();
const db = ((await sqlite.SplitFileHttpDatabase(wasmUrl, {
...config,
urlPrefix: new URL(config.urlPrefix, configUrl).toString(),
})) as unknown) as WorkerHttpvfsDatabase;
db.worker = sqlite;
const pageSizeResp = await db.exec("pragma page_size");
const pageSize = pageSizeResp[0].values[0][0];
if (pageSize !== config.requestChunkSize)
console.warn(
`Chunk size does not match page size: pragma page_size = ${pageSize} but chunkSize = ${config.requestChunkSize}`
);
const db = ((await sqlite.SplitFileHttpDatabase(
wasmUrl,
configs
)) as unknown) as Comlink.Remote<LazyHttpDatabase>;
worker.addEventListener("message", handleAsyncRequestFromWorkerThread);
return db;
return { db, worker: sqlite, configs };
}
async function handleAsyncRequestFromWorkerThread(ev: MessageEvent) {

index.ts

@@ -1 +1,3 @@
export * from "./db";
export type { SqliteStats } from "./sqlite.worker";
export type { PageReadLog } from "./lazyFile";

lazyFile.ts

@@ -15,24 +15,32 @@ export type LazyFileConfig = {
fileLength?: number;
requestChunkSize: number;
};
export type PageReadLog = {
pageno: number;
// if page was already loaded
wasCached: boolean;
// how many pages were prefetched
prefetch: number;
};
// Lazy chunked Uint8Array (implements get and length from Uint8Array). Actual getting is abstracted away for eventual reuse.
class LazyUint8Array {
export class LazyUint8Array {
serverChecked = false;
chunks: Uint8Array[] = []; // Loaded chunks. Index is the chunk number
totalFetchedBytes = 0;
totalRequests = 0;
readPages: PageReadLog[] = [];
_length?: number;
lastChunk = 0;
speed = 1;
_chunkSize: number;
rangeMapper: RangeMapper;
maxSpeed: number
maxSpeed: number;
constructor(config: LazyFileConfig) {
this._chunkSize = config.requestChunkSize;
this.maxSpeed = 1024 * 1024 / this._chunkSize; // max 1MiB at once
this.maxSpeed = (5 * 1024 * 1024) / this._chunkSize; // max 5MiB at once
this.rangeMapper = config.rangeMapper;
if (config.fileLength) {
this._length = config.fileLength;
@@ -46,24 +54,33 @@ class LazyUint8Array {
var chunkNum = (idx / this.chunkSize) | 0;
return this.getter(chunkNum)[chunkOffset];
}
getter(chunkNum: number) {
const start = chunkNum * this.chunkSize;
if (typeof this.chunks[chunkNum] === "undefined") {
lastGet = -1;
getter(wantedChunkNum: number) {
let wasCached = true;
if (typeof this.chunks[wantedChunkNum] === "undefined") {
wasCached = false;
// double the fetching chunk size if the wanted chunk would be within the next fetch request
if (chunkNum >= this.lastChunk + 1 && chunkNum <= this.lastChunk + this.speed * 2) {
const wouldStartChunkNum = this.lastChunk + 1;
let fetchStartChunkNum;
if (
wantedChunkNum >= wouldStartChunkNum &&
wantedChunkNum < wouldStartChunkNum + this.speed * 2
) {
fetchStartChunkNum = wouldStartChunkNum;
this.speed = Math.min(this.maxSpeed, this.speed * 2);
} else {
fetchStartChunkNum = wantedChunkNum;
this.speed = 1;
}
const chunksToFetch = this.speed;
let endByte = (chunkNum + chunksToFetch) * this.chunkSize - 1; // including this byte
const startByte = fetchStartChunkNum * this.chunkSize;
let endByte = (fetchStartChunkNum + chunksToFetch) * this.chunkSize - 1; // including this byte
endByte = Math.min(endByte, this.length - 1); // if datalength-1 is selected, this is the last block
this.lastChunk = chunkNum + chunksToFetch - 1;
const buf = this.doXHR(start, endByte);
this.lastChunk = fetchStartChunkNum + chunksToFetch - 1;
const buf = this.doXHR(startByte, endByte);
for (let i = 0; i < chunksToFetch; i++) {
const curChunk = chunkNum + i;
const curChunk = fetchStartChunkNum + i;
if (i * this.chunkSize >= buf.byteLength) break; // past end of file
const curSize =
(i + 1) * this.chunkSize > buf.byteLength
@@ -77,9 +94,18 @@ class LazyUint8Array {
);
}
}
if (typeof this.chunks[chunkNum] === "undefined")
throw new Error("doXHR failed!");
return this.chunks[chunkNum];
if (typeof this.chunks[wantedChunkNum] === "undefined")
throw new Error("doXHR failed (bug)!");
const boring = this.lastGet == wantedChunkNum;
if (!boring) {
this.lastGet = wantedChunkNum;
this.readPages.push({
pageno: wantedChunkNum,
wasCached,
prefetch: wasCached ? 0 : this.speed - 1,
});
}
return this.chunks[wantedChunkNum];
}
checkServer() {
// Find length
@@ -91,13 +117,17 @@
throw new Error("Couldn't load " + url + ". Status: " + xhr.status);
var datalength = Number(xhr.getResponseHeader("Content-length"));
console.log("headers", xhr.getAllResponseHeaders());
var hasByteServing = xhr.getResponseHeader("Accept-Ranges") === "bytes";
var usesGzip = xhr.getResponseHeader("Content-Encoding") === "gzip";
if (!hasByteServing) throw Error("server does not support byte serving");
if (!hasByteServing) {
const msg = "server does not support byte serving (`Accept-Ranges: bytes` header missing), or your database is hosted on CORS and the server doesn't expose the Accept-Ranges header";
console.error(msg, "seen response headers", xhr.getAllResponseHeaders());
// throw Error(msg);
}
if (usesGzip || !datalength) {
console.error("response headers", xhr.getAllResponseHeaders());
throw Error("server uses gzip or doesn't have length");
}
@@ -195,7 +225,7 @@ export function createLazyFile(
});
// use a custom read function
stream_ops.read = function stream_ops_read(
stream: {node: {contents: LazyUint8Array}},
stream: { node: { contents: LazyUint8Array } },
buffer: Uint8Array,
offset: number,
length: number,
@@ -205,22 +235,15 @@ export function createLazyFile(
console.log(
`[fs: ${length / 1024} KiB read request offset @ ${position / 1024} KiB `
);
var contents = stream.node.contents;
const contents = stream.node.contents;
if (position >= contents.length) return 0;
var size = Math.min(contents.length - position, length);
/*if (contents.slice) {
throw Error('impossible')
// normal array
for (var i = 0; i < size; i++) {
buffer[offset + i] = contents[position + i];
}
} else {*/
// TODO: optimize this to copy whole chunks at once
for (var i = 0; i < size; i++) {
// LazyUint8Array from sync binary XHR
buffer[offset + i] = contents.get(position + i)!;
}
// }
const size = Math.min(contents.length - position, length);
// TODO: optimize this to copy whole chunks at once
for (let i = 0; i < size; i++) {
// LazyUint8Array from sync binary XHR
buffer[offset + i] = contents.get(position + i)!;
}
return size;
};
node.stream_ops = stream_ops;
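The new `getter` logic above implements a simple readahead heuristic: roughly sequential chunk accesses double the fetch window up to `maxSpeed`, anything else resets it to a single chunk. A simplified standalone sketch of that decision (illustrative only, not the actual class):

```ts
// simplified illustration of the readahead logic in LazyUint8Array.getter
function planFetch(
  wantedChunkNum: number, // chunk the caller asked for
  lastChunk: number, // last chunk covered by the previous fetch
  speed: number, // current readahead window (in chunks)
  maxSpeed: number // upper bound, e.g. (5 MiB) / chunkSize
): { startChunk: number; chunkCount: number; newSpeed: number } {
  const wouldStartChunkNum = lastChunk + 1;
  if (
    wantedChunkNum >= wouldStartChunkNum &&
    wantedChunkNum < wouldStartChunkNum + speed * 2
  ) {
    // roughly sequential access: continue where the last fetch ended and double the window
    const newSpeed = Math.min(maxSpeed, speed * 2);
    return { startChunk: wouldStartChunkNum, chunkCount: newSpeed, newSpeed };
  }
  // random access: fetch only from the wanted chunk and reset the window
  return { startChunk: wantedChunkNum, chunkCount: 1, newSpeed: 1 };
}
```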

sqlite.worker.ts

@@ -3,19 +3,19 @@
import * as Comlink from "comlink";
import initSqlJs from "../sql.js/dist/sql-wasm.js";
import wasmUrl from "../sql.js/dist/sql-wasm.wasm";
import { createLazyFile, RangeMapper } from "./lazyFile";
import { createLazyFile, LazyUint8Array, PageReadLog, RangeMapper } from "./lazyFile";
import { Database, QueryExecResult } from "sql.js";
import { SeriesVtab, sqlite3_module, SqljsEmscriptenModuleType } from "./vtab";
wasmUrl;
// https://gist.github.com/frankier/4bbc85f65ad3311ca5134fbc744db711
function initTransferHandlers(sql: typeof import("sql.js")) {
Comlink.transferHandlers.set("WORKERSQLPROXIES", {
canHandle: (obj): obj is unknown => {
let isDB = obj instanceof sql.Database;
let hasDB = obj && (obj as any).db && (obj as any).db instanceof sql.Database; // prepared statements
let hasDB =
obj && (obj as any).db && (obj as any).db instanceof sql.Database; // prepared statements
return isDB || hasDB;
},
serialize(obj) {
@@ -47,71 +47,186 @@ export function toObjects<T>(res: QueryExecResult[]): T[] {
});
}
export type SplitFileConfig = {
lastUpdated: number;
urlPrefix: string;
serverChunkSize: number;
databaseLengthBytes: number;
requestChunkSize: number;
export type SplitFileConfig =
| SplitFileConfigPure
| {
virtualFilename?: string;
from: "jsonconfig";
configUrl: string;
};
export type SplitFileConfigPure = {
virtualFilename?: string;
from: "inline";
config: SplitFileConfigInner;
};
export type SplitFileConfigInner = {
requestChunkSize: number;
} & (
| {
serverMode: "chunked";
urlPrefix: string;
serverChunkSize: number;
databaseLengthBytes: number;
}
| {
serverMode: "full";
url: string;
}
);
export interface LazyHttpDatabase extends Database {
lazyFile: any
filename: string
query: <T = any>(query: string, ...params: any[]) => T[]
create_vtab: (cons: {new(sqljs: SqljsEmscriptenModuleType, db: Database): sqlite3_module}) => void
lazyFiles: Map<string, { contents: LazyUint8Array }>;
filename: string;
query: <T = any>(query: string, ...params: any[]) => T[];
create_vtab: (cons: {
new (sqljs: SqljsEmscriptenModuleType, db: Database): sqlite3_module;
}) => void;
}
export type SqliteStats = {
filename: string;
totalBytes: number;
totalFetchedBytes: number;
totalRequests: number;
};
async function fetchConfigs(
configsOrUrls: SplitFileConfig[]
): Promise<SplitFileConfigPure[]> {
const configs = configsOrUrls.map(async (config) => {
if (config.from === "jsonconfig") {
const configUrl = new URL(config.configUrl, location.href);
const req = await fetch(configUrl.toString());
if (!req.ok) {
console.error("httpvfs config error", await req.text());
throw Error(
`Could not load httpvfs config: ${req.status}: ${req.statusText}`
);
}
const configOut: SplitFileConfigInner = await req.json();
return {
from: "inline",
// resolve url relative to config file
config:
configOut.serverMode === "chunked"
? {
...configOut,
urlPrefix: new URL(configOut.urlPrefix, configUrl).toString(),
}
: {
...configOut,
url: new URL(configOut.url, configUrl).toString(),
},
virtualFilename: config.virtualFilename,
} as SplitFileConfigPure;
} else {
return config;
}
});
return Promise.all(configs);
}
const mod = {
db: null as null | LazyHttpDatabase,
inited: false,
sqljs: null as null | Promise<any>,
async SplitFileHttpDatabase(wasmUrl: string, p: SplitFileConfig): Promise<Database> {
if (this.db) throw Error(`sorry, only one db is supported right now`);
async SplitFileHttpDatabase(
wasmUrl: string,
configs: SplitFileConfig[],
mainVirtualFilename?: string
): Promise<LazyHttpDatabase> {
if (this.inited) throw Error(`sorry, only one db is supported right now`);
this.inited = true;
if (!this.sqljs) {
this.sqljs = init(wasmUrl);
}
const sql = await this.sqljs;
console.log("constructing url database");
const rangeMapper: RangeMapper = (from: number, to: number) => {
const serverChunkId = (from / p.serverChunkSize) | 0;
const serverFrom = from % p.serverChunkSize;
const serverTo = serverFrom + (to - from);
return {
url: p.urlPrefix + String(serverChunkId).padStart(3, "0"),
fromByte: serverFrom,
toByte: serverTo,
};
};
const lazyFiles = new Map();
const hydratedConfigs = await fetchConfigs(configs);
let mainFileConfig;
for (const { config, virtualFilename } of hydratedConfigs) {
const id =
config.serverMode === "chunked" ? config.urlPrefix : config.url;
console.log("constructing url database", id);
let rangeMapper: RangeMapper;
if (config.serverMode == "chunked") {
rangeMapper = (from: number, to: number) => {
const serverChunkId = (from / config.serverChunkSize) | 0;
const serverFrom = from % config.serverChunkSize;
const serverTo = serverFrom + (to - from);
return {
url: config.urlPrefix + String(serverChunkId).padStart(3, "0"),
fromByte: serverFrom,
toByte: serverTo,
};
};
} else {
rangeMapper = (fromByte, toByte) => ({
url: config.url,
fromByte,
toByte,
});
}
const filename = p.urlPrefix.replace(/\//g, "_");
console.log("filename", filename);
const lazyFile = createLazyFile(sql.FS, "/", filename, true, true, {
rangeMapper,
requestChunkSize: p.requestChunkSize,
fileLength: p.databaseLengthBytes,
});
const filename = virtualFilename || id.replace(/\//g, "_");
this.db = new sql.CustomDatabase(filename);
this.db!.lazyFile = lazyFile;
this.db!.create_vtab(SeriesVtab);
this.db!.query = (...args) => toObjects(this.db!.exec(...args));
if (!mainVirtualFilename) {
mainVirtualFilename = filename;
mainFileConfig = config
}
console.log("filename", filename);
console.log("constructing url database", id, "filename", filename);
const lazyFile = createLazyFile(sql.FS, "/", filename, true, true, {
rangeMapper,
requestChunkSize: config.requestChunkSize,
fileLength:
config.serverMode === "chunked"
? config.databaseLengthBytes
: undefined,
});
lazyFiles.set(filename, lazyFile);
}
this.db = new sql.CustomDatabase(mainVirtualFilename) as LazyHttpDatabase;
if (mainFileConfig) {
// verify page size and disable cache (since we hold everything in memory anyways)
const pageSizeResp = await this.db.exec("pragma page_size; pragma cache_size=0");
const pageSize = pageSizeResp[0].values[0][0];
if (pageSize !== mainFileConfig.requestChunkSize)
console.warn(
`Chunk size does not match page size: pragma page_size = ${pageSize} but chunkSize = ${mainFileConfig.requestChunkSize}`
);
}
this.db.lazyFiles = lazyFiles;
this.db.create_vtab(SeriesVtab);
this.db.query = (...args) => toObjects(this.db!.exec(...args));
return this.db!;
},
async getStats() {
getResetAccessedPages(virtualFilename?: string): PageReadLog[] {
if (!this.db) return [];
const lazyFile = this.db.lazyFiles.get(virtualFilename || this.db.filename);
if (!lazyFile) throw Error("unknown lazy file");
const pages = [...lazyFile.contents.readPages];
lazyFile.contents.readPages = [];
return pages;
},
getStats(virtualFilename?: string): SqliteStats | null {
const db = this.db;
if (!db) return null;
return {
const lazyFile = db.lazyFiles.get(virtualFilename || db.filename);
if (!lazyFile) throw Error("unknown lazy file");
const res = {
filename: db.filename,
totalBytes: db.lazyFile.contents.length,
totalFetchedBytes: db.lazyFile.contents.totalFetchedBytes,
totalRequests: db.lazyFile.contents.totalRequests,
totalBytes: lazyFile.contents.length,
totalFetchedBytes: lazyFile.contents.totalFetchedBytes,
totalRequests: lazyFile.contents.totalRequests,
};
return res;
},
async evalCode(code: string) {
return await eval(`(async function (db) {
${code}
})`)(this.db);
}
},
};
export type SqliteComlinkMod = typeof mod;
Comlink.expose(mod);
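On the main thread, the new stats and page-log functions are reachable through the Comlink proxy returned by `createDbWorker`. A minimal sketch, assuming `worker` is a `WorkerHttpvfs` as in the README example (all calls return Promises because they go through Comlink):

```ts
// sketch: reading fetch statistics and the page access log from the main thread
const stats = await worker.worker.getStats(); // SqliteStats | null
if (stats) {
  console.log(
    `${stats.filename}: fetched ${stats.totalFetchedBytes} of ${stats.totalBytes} bytes in ${stats.totalRequests} requests`
  );
}
// pages read since the last call, with wasCached / prefetch info per page
const pages = await worker.worker.getResetAccessedPages();
console.log(pages);
```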