The "Delete Duplicate Folders in System" problem involves simulating a file system's folder structure and identifying duplicate folders. You are given a list of folder paths, where each path is represented as a list of strings (each string is a folder name). Your task is to find all duplicate folders—where two or more folders (including all their subfolders and structure) are identical in content and structure—and delete them from the system. After deletion, return the remaining folder paths in any order.
["a", "b", "c"]
representing the path /a/b/c
.At first glance, it might seem necessary to compare every pair of folders and their subfolders to identify duplicates. However, this brute-force approach would be too slow for large inputs, as it would require checking every possible subtree for equality.
Instead, we need a way to efficiently identify duplicate subtrees. The key insight is that if we can "serialize" the subtree rooted at each folder (i.e., represent its structure and contents as a unique string), then identical subtrees will have the same serialization.
By traversing the folder tree from the leaves up (post-order traversal), we can build the serialization for each subtree. Using a hash map, we can count how many times each serialization appears. If a serialization appears more than once, it means that subtree is duplicated and should be deleted everywhere it occurs.
This approach ensures each subtree is only serialized and compared once, making it efficient. The use of a hash map allows for quick lookup and counting.
Sample Input:
paths = [["a"], ["a","x"], ["a","y"], ["a","x","z"], ["b"], ["b","x"], ["b","y"], ["b","x","z"]]
/a
and /b
have subfolders x
and y
, and x
has subfolder z
in both cases.x/z
is (z)
.x
is (x(z))
.y
is (y)
.a
and b
is (a(x(z))(y))
and (b(x(z))(y))
respectively./a/x
and /b/x
are identical, as are /a/y
and /b/y
./a/x
and /b/x
(and their subtrees) are duplicates, as are /a/y
and /b/y
, mark them for deletion./a
and /b
remain (as their subfolders have been deleted).[["a"], ["b"]]
.The core strategy for deleting duplicate folders is to represent each subtree as a unique string (serialization) and use a hash map to detect duplicates. By post-order traversing the folder tree, we efficiently identify and remove all duplicate subtrees in linear time relative to the number of folders. This method avoids redundant comparisons and leverages hashing for quick detection, making the solution both elegant and performant.
class TrieNode:
def __init__(self):
self.children = dict()
self.name = ""
self.is_deleted = False
class Solution:
def deleteDuplicateFolder(self, paths):
root = TrieNode()
# Build the Trie
for path in paths:
node = root
for name in path:
if name not in node.children:
node.children[name] = TrieNode()
node.children[name].name = name
node = node.children[name]
# Map to count serializations
from collections import defaultdict
serial_count = defaultdict(int)
# Serialization function
def serialize(node):
if not node.children:
return ""
serials = []
for child in sorted(node.children):
serials.append(child + "(" + serialize(node.children[child]) + ")")
serial = "".join(serials)
serial_count[serial] += 1
return serial
serialize(root)
# Mark duplicates for deletion
def mark(node):
if not node.children:
return ""
serials = []
for child in sorted(node.children):
s = child + "(" + mark(node.children[child]) + ")"
serials.append(s)
serial = "".join(serials)
if serial and serial_count[serial] > 1:
node.is_deleted = True
return serial
mark(root)
# Collect remaining paths
res = []
def collect(node, path):
for child in node.children.values():
if not child.is_deleted:
res.append(path + [child.name])
collect(child, path + [child.name])
collect(root, [])
return res
class TrieNode {
public:
unordered_map<string, TrieNode*> children;
string name;
bool is_deleted = false;
TrieNode() {}
};
class Solution {
public:
vector<vector<string>> deleteDuplicateFolder(vector<vector<string>>& paths) {
TrieNode* root = new TrieNode();
// Build Trie
for (auto& path : paths) {
TrieNode* node = root;
for (string& name : path) {
if (!node->children.count(name)) {
node->children[name] = new TrieNode();
node->children[name]->name = name;
}
node = node->children[name];
}
}
unordered_map<string, int> serial_count;
// Serialization
function<string(TrieNode*)> serialize = [&](TrieNode* node) {
if (node->children.empty()) return string("");
vector<string> serials;
for (auto& p : node->children) serials.push_back(p.first + "(" + serialize(p.second) + ")");
sort(serials.begin(), serials.end());
string serial;
for (auto& s : serials) serial += s;
serial_count[serial]++;
return serial;
};
serialize(root);
// Mark duplicates
function<string(TrieNode*)> mark = [&](TrieNode* node) {
if (node->children.empty()) return string("");
vector<string> serials;
for (auto& p : node->children) serials.push_back(p.first + "(" + mark(p.second) + ")");
sort(serials.begin(), serials.end());
string serial;
for (auto& s : serials) serial += s;
if (!serial.empty() && serial_count[serial] > 1) node->is_deleted = true;
return serial;
};
mark(root);
// Collect
vector<vector<string>> res;
function<void(TrieNode*, vector<string>&)> collect = [&](TrieNode* node, vector<string>& path) {
for (auto& p : node->children) {
if (!p.second->is_deleted) {
path.push_back(p.first);
res.push_back(path);
collect(p.second, path);
path.pop_back();
}
}
};
vector<string> path;
collect(root, path);
return res;
}
};
class TrieNode {
Map<String, TrieNode> children = new HashMap<>();
String name = "";
boolean isDeleted = false;
}
class Solution {
public List<List<String>> deleteDuplicateFolder(List<List<String>> paths) {
TrieNode root = new TrieNode();
// Build Trie
for (List<String> path : paths) {
TrieNode node = root;
for (String name : path) {
node.children.putIfAbsent(name, new TrieNode());
node.children.get(name).name = name;
node = node.children.get(name);
}
}
Map<String, Integer> serialCount = new HashMap<>();
// Serialization
serialize(root, serialCount);
// Mark duplicates
mark(root, serialCount);
// Collect
List<List<String>> res = new ArrayList<>();
collect(root, new ArrayList<>(), res);
return res;
}
private String serialize(TrieNode node, Map<String, Integer> serialCount) {
if (node.children.isEmpty()) return "";
List<String> serials = new ArrayList<>();
for (String child : node.children.keySet()) {
serials.add(child + "(" + serialize(node.children.get(child), serialCount) + ")");
}
Collections.sort(serials);
StringBuilder sb = new StringBuilder();
for (String s : serials) sb.append(s);
String serial = sb.toString();
serialCount.put(serial, serialCount.getOrDefault(serial, 0) + 1);
return serial;
}
private String mark(TrieNode node, Map<String, Integer> serialCount) {
if (node.children.isEmpty()) return "";
List<String> serials = new ArrayList<>();
for (String child : node.children.keySet()) {
serials.add(child + "(" + mark(node.children.get(child), serialCount) + ")");
}
Collections.sort(serials);
StringBuilder sb = new StringBuilder();
for (String s : serials) sb.append(s);
String serial = sb.toString();
if (!serial.isEmpty() && serialCount.get(serial) > 1) node.isDeleted = true;
return serial;
}
private void collect(TrieNode node, List<String> path, List<List<String>> res) {
for (Map.Entry<String, TrieNode> entry : node.children.entrySet()) {
if (!entry.getValue().isDeleted) {
path.add(entry.getKey());
res.add(new ArrayList<>(path));
collect(entry.getValue(), path, res);
path.remove(path.size() - 1);
}
}
}
}
class TrieNode {
constructor() {
this.children = {};
this.name = "";
this.isDeleted = false;
}
}
var deleteDuplicateFolder = function(paths) {
const root = new TrieNode();
// Build Trie
for (const path of paths) {
let node = root;
for (const name of path) {
if (!(name in node.children)) {
node.children[name] = new TrieNode();
node.children[name].name = name;
}
node = node.children[name];
}
}
const serialCount = {};
// Serialization
function serialize(node) {
if (Object.keys(node.children).length === 0) return "";
const serials = [];
for (const child of Object.keys(node.children).sort()) {
serials.push(child + "(" + serialize(node.children[child]) + ")");
}
const serial = serials.join("");
serialCount[serial] = (serialCount[serial] || 0) + 1;
return serial;
}
serialize(root);
// Mark duplicates
function mark(node) {
if (Object.keys(node.children).length === 0) return "";
const serials = [];
for (const child of Object.keys(node.children).sort()) {
serials.push(child + "(" + mark(node.children[child]) + ")");
}
const serial = serials.join("");
if (serial && serialCount[serial] > 1) node.isDeleted = true;
return serial;
}
mark(root);
// Collect
const res = [];
function collect(node, path) {
for (const child in node.children) {
if (!node.children[child].isDeleted) {
res.push([...path, child]);
collect(node.children[child], [...path, child]);
}
}
}
collect(root, []);
return res;
};