From d15c1ba5ff52cac1431ce1df7cb30dd24279003e Mon Sep 17 00:00:00 2001 From: Renovate Bot Date: Tue, 6 Apr 2021 23:03:41 +0000 Subject: [PATCH 1/5] :pushpin: deps: Pin dependency @aureooms/js-compare to 2.0.1 --- package.json | 2 +- yarn.lock | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/package.json b/package.json index 001d098..b6d3dee 100644 --- a/package.json +++ b/package.json @@ -65,7 +65,7 @@ "dependencies": {}, "devDependencies": { "@aureooms/js-array": "4.0.0", - "@aureooms/js-compare": "^2.0.1", + "@aureooms/js-compare": "2.0.1", "@aureooms/js-functools": "2.0.3", "@aureooms/js-itertools": "5.1.0", "@aureooms/js-memory": "4.0.0", diff --git a/yarn.lock b/yarn.lock index 8a87c42..527970f 100644 --- a/yarn.lock +++ b/yarn.lock @@ -14,7 +14,7 @@ dependencies: "@aureooms/js-error" "^5.0.2" -"@aureooms/js-compare@^2.0.1": +"@aureooms/js-compare@2.0.1": version "2.0.1" resolved "https://registry.yarnpkg.com/@aureooms/js-compare/-/js-compare-2.0.1.tgz#4636ea0736945abbcca38d5bd56a1f405034b2f9" integrity sha512-J+gG1wlwF401ySv0mOYhIIVUuoukvZaRkIwYNH9xZTLhRyI4VxoaFi3k8GA0ebjHqndLGYJS9uEeG3cuQcg+7w== From f85c79342396049c9009e7c064d09d48805356ba Mon Sep 17 00:00:00 2001 From: Renovate Bot Date: Tue, 6 Apr 2021 23:21:59 +0000 Subject: [PATCH 2/5] :arrow_up: deps: Upgrade JamesIves/github-pages-deploy-action action to v4.1.1 --- .github/workflows/gh-pages.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/gh-pages.yml b/.github/workflows/gh-pages.yml index 117d42b..ef023da 100644 --- a/.github/workflows/gh-pages.yml +++ b/.github/workflows/gh-pages.yml @@ -17,7 +17,7 @@ jobs: run: npm run build-gh-pages - name: Deploy 🚀 - uses: JamesIves/github-pages-deploy-action@4.1.0 + uses: JamesIves/github-pages-deploy-action@4.1.1 with: branch: gh-pages folder: gh-pages From dfb68b620f018ae78a7611f189d73311c46bc836 Mon Sep 17 00:00:00 2001 From: Renovate Bot Date: Wed, 7 Apr 2021 22:01:48 +0000 Subject: [PATCH 3/5] 
:arrow_up: deps: Upgrade dependency c8 to v7.7.1 --- package.json | 2 +- yarn.lock | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/package.json b/package.json index b6d3dee..9374769 100644 --- a/package.json +++ b/package.json @@ -79,7 +79,7 @@ "babel-plugin-transform-remove-console": "6.9.4", "babel-plugin-unassert": "3.0.1", "babel-preset-power-assert": "3.0.0", - "c8": "7.7.0", + "c8": "7.7.1", "coveralls": "3.1.0", "esdoc": "1.1.0", "esdoc-inject-script-plugin": "1.0.0", diff --git a/yarn.lock b/yarn.lock index 527970f..03e9091 100644 --- a/yarn.lock +++ b/yarn.lock @@ -2620,10 +2620,10 @@ builtins@^1.0.3: resolved "https://registry.yarnpkg.com/builtins/-/builtins-1.0.3.tgz#cb94faeb61c8696451db36534e1422f94f0aee88" integrity sha1-y5T662HIaWRR2zZTThQi+U8K7og= -c8@7.7.0: - version "7.7.0" - resolved "https://registry.yarnpkg.com/c8/-/c8-7.7.0.tgz#22241fa5a65c8d059e0c18f4eb845e6d8244c643" - integrity sha512-9OoBQBa5FPs7NNcjaH52SfQpLCXsDRwJKPOeQ9K1MyYoMlnfazMx3XHp+inFPxMA5BV6VMWw1uFrV9sao1oBqA== +c8@7.7.1: + version "7.7.1" + resolved "https://registry.yarnpkg.com/c8/-/c8-7.7.1.tgz#442c5e175f47c407d8631e8d82f17260f666ad10" + integrity sha512-OO9KpDGv1iTd/MBNUForJH7vPKt9XnRPWSBKeRJGma4xfTaKBObA0zWAplFpFRuf/qRmATFqGFrzxqDk51LXsw== dependencies: "@bcoe/v8-coverage" "^0.2.3" "@istanbuljs/schema" "^0.1.2" From f6d707d6a763da80370095dcdc98b5a9e3a4f9fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Ooms?= Date: Thu, 8 Apr 2021 22:46:18 +0200 Subject: [PATCH 4/5] :sparkles: feat: First draft for reservoir sampling. Fixes #18. 
--- package.json | 1 + src/api/reservoir.js | 14 ++++++++ src/index.js | 2 ++ src/kernel/_waterman.js | 77 +++++++++++++++++++++++++++++++++++++++++ test/src/reservoir.js | 38 ++++++++++++++++++++ yarn.lock | 5 +++ 6 files changed, 137 insertions(+) create mode 100644 src/api/reservoir.js create mode 100644 src/kernel/_waterman.js create mode 100644 test/src/reservoir.js diff --git a/package.json b/package.json index 9374769..2bbfd75 100644 --- a/package.json +++ b/package.json @@ -69,6 +69,7 @@ "@aureooms/js-functools": "2.0.3", "@aureooms/js-itertools": "5.1.0", "@aureooms/js-memory": "4.0.0", + "@aureooms/js-red-black-tree": "^9.0.0", "@aureooms/js-type": "1.0.4", "@babel/core": "7.13.14", "@babel/preset-env": "7.13.12", diff --git a/src/api/reservoir.js b/src/api/reservoir.js new file mode 100644 index 0000000..0ccf6a1 --- /dev/null +++ b/src/api/reservoir.js @@ -0,0 +1,14 @@ +import _waterman from '../kernel/_waterman.js'; +import randint from './randint.js'; + +/** + * Reservoir sampling. + * + * @function + * @param {number} k The size of the sample. + * @param {Iterable} iterable The input iterable. + * @param {Array} [output=new Array(k)] The output array. + * @return {Array} The output array. 
+ */ +const reservoir = _waterman(randint); +export default reservoir; diff --git a/src/index.js b/src/index.js index 7365016..e81330d 100644 --- a/src/index.js +++ b/src/index.js @@ -3,6 +3,7 @@ export {default as randfloat} from './api/randfloat.js'; export {default as randint} from './api/randint.js'; export {default as random} from './api/random.js'; export {default as randrange} from './api/randrange.js'; +export {default as reservoir} from './api/reservoir.js'; export {default as sample} from './api/sample.js'; export {default as shuffle} from './api/shuffle.js'; export {default as shuffled} from './api/shuffled.js'; @@ -12,3 +13,4 @@ export {default as _fisheryates_inside_out} from './kernel/_fisheryates_inside_o export {default as _randfloat} from './kernel/_randfloat.js'; export {default as _randint} from './kernel/_randint.js'; export {default as _shuffle} from './kernel/_shuffle.js'; +export {default as _waterman} from './kernel/_waterman.js'; diff --git a/src/kernel/_waterman.js b/src/kernel/_waterman.js new file mode 100644 index 0000000..2269982 --- /dev/null +++ b/src/kernel/_waterman.js @@ -0,0 +1,77 @@ +/** + * Construct a sampling function using Algorithm R due to Alan Waterman (both + * name and attribution are due to Knuth). + * + * @param {Function} randint The randint function. + * @return {Function} The sample function. + */ +const _waterman = (randint) => { + /** + * Samples k items uniformly at random from an iterable of unknown size. + * + * We want each item to have probability k/n of being selected. + * + * The algorithm works as follows: + * 1. We initialize a candidate sample with the first k items. + * 2. 
For each remaining item i, decide whether to insert it in the + * candidate sample with probability k/i, evicting an item from the + * candidate sample at random, or to discard it immediately (with + * probability 1-k/i). + * + * To prove that the obtained probability of inclusion for each item is correct + * we multiply two probabilities: + * 1. The probability of entering the candidate sample. + * 2. The probability of staying in the candidate sample until the end. + * + * For items 1 to k, probability 1. is 1, and probability 2. is + * (1-1/(k+1))(1-1/(k+2))...(1-1/n) + * = (k/(k+1))((k+1)/(k+2))...((n-1)/n) which telescopes to k/n. + * + * For items i = k+1 to n, probability 1. is k/i, and probability 2. + * is (1-1/(i+1))(1-1/(i+2))...(1-1/n) + * = (i/(i+1))((i+1)/(i+2))...((n-1)/n) which telescopes to i/n, so the product is k/n. + * + * NOTE: Could also implement so that it yields after each input item. + * NOTE: One can reduce the expected number of random bits needed by + * avoiding generating any number above k-1: + * - First we branch on whether i < k. + * - Then we generate the random number between 0 and k-1 only if needed. + * + * To decide on the branch, flip a biased coin with parameter p = k/n. + * To do so, flip a fair coin until it differs from the binary + * representation of k/n (0.10110101...). + * The computation can be made efficient by realizing several things: + * - k is fixed and smaller than n (so divmod step can be skipped) + * - k/(n+1) < k/n (so we can avoid recomputing if the biased flip > k/n) + * + * This would reduce the number of necessary random bits from O(n log n) to + * expected O(n). + * + * @param {number} k The size of the sample. + * @param {Iterable} iterable The input iterable. + * @param {Array} [output=new Array(k)] The output array. + * @return {Array} The output array. 
+ */ + const sample = (k, iterable, output = new Array(k)) => { + const it = iterable[Symbol.iterator](); + + let n = 0; + + for (; n < k; ++n) { + const {value, done} = it.next(); + if (done) return output; + output[n] = value; + } + + for (; ; ++n) { + const {value, done} = it.next(); + if (done) return output; + const i = randint(0, n); + if (i < k) output[i] = value; + } + }; + + return sample; +}; + +export default _waterman; diff --git a/test/src/reservoir.js b/test/src/reservoir.js new file mode 100644 index 0000000..0ac62cc --- /dev/null +++ b/test/src/reservoir.js @@ -0,0 +1,38 @@ +import test from 'ava'; +import {range} from '@aureooms/js-itertools'; +import {increasing} from '@aureooms/js-compare'; +import {RedBlackTree} from '@aureooms/js-red-black-tree'; +import {reservoir, _waterman, randint} from '../../src/index.js'; + +const macro = (t, _, reservoir, k, n) => { + const sample = reservoir(k, range(n)); + const source = RedBlackTree.from(increasing, range(n)); + // We cannot use a Set as it would smoosh input duplicates + + console.debug({sample}); + t.is(sample.length, k); + for (const i of range(Math.min(k, n))) t.true(source.remove(sample[i])); + for (const i of range(n, k)) t.true(sample[i] === undefined); +}; + +macro.title = (title, algo, _, k, n) => + title || `[${algo}] reservoir(${k}, range(${n}))`; + +const algorithms = [ + ['Waterman', _waterman(randint)], + ['API', reservoir], +]; + +const params = [ + [0, 10], + [5, 10], + [10, 5], + [10, 10], + [50, 1000], +]; + +for (const [name, algorithm] of algorithms) { + for (const [k, input] of params) { + test(macro, name, algorithm, k, input); + } +} diff --git a/yarn.lock b/yarn.lock index 03e9091..6132585 100644 --- a/yarn.lock +++ b/yarn.lock @@ -42,6 +42,11 @@ resolved "https://registry.yarnpkg.com/@aureooms/js-memory/-/js-memory-4.0.0.tgz#db87dc64b948f672d73b434ebde047b05869712c" integrity sha1-24fcZLlI9nLXO0NOveBHsFhpcSw= +"@aureooms/js-red-black-tree@^9.0.0": + version "9.0.0" + resolved 
"https://registry.yarnpkg.com/@aureooms/js-red-black-tree/-/js-red-black-tree-9.0.0.tgz#ee006f24af42749546232b2d0baa13910c98f7b2" + integrity sha512-sUtY0HnwQnBUjrfwysKc6H4BJO4O2+NnrUHLqTYJyT1l1VSI+oXGffjjmMJTFpIl4L/4FEZAN0L3BiQxgR1T8g== + "@aureooms/js-type@1.0.4": version "1.0.4" resolved "https://registry.yarnpkg.com/@aureooms/js-type/-/js-type-1.0.4.tgz#7f9de5f5f8506ff9c8958731744b7427b62e92b7" From 5ace169f2d699171b7013f2887a88dc393d695c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Ooms?= Date: Thu, 8 Apr 2021 22:50:48 +0200 Subject: [PATCH 5/5] :hatching_chick: release: Bumping to v3.4.0. --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 2bbfd75..96b664d 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "name": "@aureooms/js-random", "description": "Randomness algorithms for JavaScript", - "version": "3.3.0", + "version": "3.4.0", "license": "AGPL-3.0", "author": "Aurélien Ooms ", "homepage": "https://aureooms.github.io/js-random",