deterministicGrouping.js 7 KB

Raw Blame History Permalink

"use strict";

// Simulations show these probabilities for a single change
// 93.1% that one group is invalidated
// 4.8% that two groups are invalidated
// 1.1% that 3 groups are invalidated
// 0.1% that 4 or more groups are invalidated
//
// And these for removing/adding 10 lexically adjacent files
// 64.5% that one group is invalidated
// 24.8% that two groups are invalidated
// 7.8% that 3 groups are invalidated
// 2.7% that 4 or more groups are invalidated
//
// And these for removing/adding 3 random files
// 0% that one group is invalidated
// 3.7% that two groups are invalidated
// 80.8% that 3 groups are invalidated
// 12.3% that 4 groups are invalidated
// 3.2% that 5 or more groups are invalidated

/**
 *
 * @param {string} a key
 * @param {string} b key
 * @returns {number} the similarity as number
 */
const similarity = (a, b) => {
	const l = Math.min(a.length, b.length);
	let dist = 0;
	for (let i = 0; i < l; i++) {
		const ca = a.charCodeAt(i);
		const cb = b.charCodeAt(i);
		dist += Math.max(0, 10 - Math.abs(ca - cb));
	}
	return dist;
};

/**
 * @param {string} a key
 * @param {string} b key
 * @returns {string} the common part and a single char for the difference
 */
const getName = (a, b) => {
	const l = Math.min(a.length, b.length);
	let r = "";
	for (let i = 0; i < l; i++) {
		const ca = a.charAt(i);
		const cb = b.charAt(i);
		r += ca;
		if (ca === cb) {
			continue;
		}
		return r;
	}
	return a;
};

/**
 * @template T
 */
class Node {
	/**
	 * @param {T} item item
	 * @param {string} key key
	 * @param {number} size size
	 */
	constructor(item, key, size) {
		this.item = item;
		this.key = key;
		this.size = size;
	}
}

/**
 * @template T
 */
class Group {
	/**
	 * @param {Node<T>[]} nodes nodes
	 * @param {number[]} similarities similarities between the nodes (length = nodes.length - 1)
	 */
	constructor(nodes, similarities) {
		this.nodes = nodes;
		this.similarities = similarities;
		this.size = nodes.reduce((size, node) => size + node.size, 0);
		/** @type {string} */
		this.key = undefined;
	}
}

/**
 * @template T
 * @typedef {Object} GroupedItems<T>
 * @property {string} key
 * @property {T[]} items
 * @property {number} size
 */

/**
 * @template T
 * @typedef {Object} Options
 * @property {number} maxSize maximum size of a group
 * @property {number} minSize minimum size of a group (preferred over maximum size)
 * @property {Iterable<T>} items a list of items
 * @property {function(T): number} getSize function to get size of an item
 * @property {function(T): string} getKey function to get the key of an item
 */

/**
 * @template T
 * @param {Options<T>} options options object
 * @returns {GroupedItems<T>[]} grouped items
 */
module.exports = ({ maxSize, minSize, items, getSize, getKey }) => {
	/** @type {Group<T>[]} */
	const result = [];

	const nodes = Array.from(
		items,
		item => new Node(item, getKey(item), getSize(item))
	);

	/** @type {Node<T>[]} */
	const initialNodes = [];

	// lexically ordering of keys
	nodes.sort((a, b) => {
		if (a.key < b.key) return -1;
		if (a.key > b.key) return 1;
		return 0;
	});

	// return nodes bigger than maxSize directly as group
	for (const node of nodes) {
		if (node.size >= maxSize) {
			result.push(new Group([node], []));
		} else {
			initialNodes.push(node);
		}
	}

	if (initialNodes.length > 0) {
		// calculate similarities between lexically adjacent nodes
		/** @type {number[]} */
		const similarities = [];
		for (let i = 1; i < initialNodes.length; i++) {
			const a = initialNodes[i - 1];
			const b = initialNodes[i];
			similarities.push(similarity(a.key, b.key));
		}

		const initialGroup = new Group(initialNodes, similarities);

		if (initialGroup.size < minSize) {
			// We hit an edgecase where the working set is already smaller than minSize
			// We merge it with the smallest result node to keep minSize intact
			if (result.length > 0) {
				const smallestGroup = result.reduce((min, group) =>
					min.size > group.size ? group : min
				);
				for (const node of initialGroup.nodes) smallestGroup.nodes.push(node);
				smallestGroup.nodes.sort((a, b) => {
					if (a.key < b.key) return -1;
					if (a.key > b.key) return 1;
					return 0;
				});
			} else {
				// There are no other nodes
				// We use all nodes and have to accept that it's smaller than minSize
				result.push(initialGroup);
			}
		} else {
			const queue = [initialGroup];

			while (queue.length) {
				const group = queue.pop();
				// only groups bigger than maxSize need to be splitted
				if (group.size < maxSize) {
					result.push(group);
					continue;
				}

				// find unsplittable area from left and right
				// going minSize from left and right
				// at least one node need to be included otherwise we get stuck
				let left = 0;
				let leftSize = 0;
				while (leftSize <= minSize) {
					leftSize += group.nodes[left].size;
					left++;
				}
				let right = group.nodes.length - 1;
				let rightSize = 0;
				while (rightSize <= minSize) {
					rightSize += group.nodes[right].size;
					right--;
				}

				if (left - 1 > right) {
					// can't split group while holding minSize
					// because minSize is preferred of maxSize we return
					// the group here even while it's too big
					// To avoid this make sure maxSize > minSize * 3
					result.push(group);
					continue;
				}
				if (left <= right) {
					// when there is a area between left and right
					// we look for best split point
					// we split at the minimum similarity
					// here key space is separated the most
					let best = left - 1;
					let bestSimilarity = group.similarities[best];
					for (let i = left; i <= right; i++) {
						const similarity = group.similarities[i];
						if (similarity < bestSimilarity) {
							best = i;
							bestSimilarity = similarity;
						}
					}
					left = best + 1;
					right = best;
				}

				// create two new groups for left and right area
				// and queue them up
				const rightNodes = [group.nodes[right + 1]];
				/** @type {number[]} */
				const rightSimilaries = [];
				for (let i = right + 2; i < group.nodes.length; i++) {
					rightSimilaries.push(group.similarities[i - 1]);
					rightNodes.push(group.nodes[i]);
				}
				queue.push(new Group(rightNodes, rightSimilaries));

				const leftNodes = [group.nodes[0]];
				/** @type {number[]} */
				const leftSimilaries = [];
				for (let i = 1; i < left; i++) {
					leftSimilaries.push(group.similarities[i - 1]);
					leftNodes.push(group.nodes[i]);
				}
				queue.push(new Group(leftNodes, leftSimilaries));
			}
		}
	}

	// lexically ordering
	result.sort((a, b) => {
		if (a.nodes[0].key < b.nodes[0].key) return -1;
		if (a.nodes[0].key > b.nodes[0].key) return 1;
		return 0;
	});

	// give every group a name
	for (let i = 0; i < result.length; i++) {
		const group = result[i];
		const first = group.nodes[0];
		const last = group.nodes[group.nodes.length - 1];
		let name = getName(first.key, last.key);
		group.key = name;
	}

	// return the results
	return result.map(group => {
		/** @type {GroupedItems} */
		return {
			key: group.key,
			items: group.nodes.map(node => node.item),
			size: group.size
		};
	});
};