The Web Crawler Multithreaded problem asks you to implement a web crawler that starts from a given startUrl and explores every URL reachable from it, restricted to URLs that share the same hostname as startUrl. The crawler should use multiple threads to improve performance. It is given an HtmlParser interface whose getUrls method fetches all URLs linked from a page. Your solution must not visit the same URL more than once, and the crawling must be thread-safe (no race conditions or duplicate visits). The result is the list of all URLs with the same hostname as startUrl that are reachable from it.
For example, if startUrl is http://news.yahoo.com/news/topics/, then startUrl itself should be included in the result.
To solve this problem, we need to traverse the web graph starting from startUrl, visiting each page and collecting its links. Since we only want URLs with the same hostname, we must filter links accordingly. The simplest approach is a breadth-first search (BFS) or depth-first search (DFS) that keeps track of visited URLs to avoid cycles. However, the problem requires multiple threads, which means the visited set and the work queue must be shared safely among them.
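Before adding threads, it helps to see the traversal on its own. The following is a minimal single-threaded sketch, assuming the same getUrls(url) interface the threaded solutions below use; the hostname helper simply takes the segment between "://" and the next "/".

from collections import deque

def crawl_sequential(startUrl, htmlParser):
    def hostname(url):
        # For "http://host/path", splitting on '/' puts the host at index 2.
        return url.split('/')[2]

    host = hostname(startUrl)
    visited = {startUrl}
    queue = deque([startUrl])
    while queue:
        url = queue.popleft()
        for next_url in htmlParser.getUrls(url):
            # Keep only same-hostname URLs we have not seen before.
            if hostname(next_url) == host and next_url not in visited:
                visited.add(next_url)
                queue.append(next_url)
    return list(visited)

Threading this loop is exactly where the visited set and the queue become shared state.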
The main challenges are:
- Sharing the visited set and the work queue safely across threads, since an unsynchronized check-then-add lets two threads enqueue the same URL (see the sketch below).
- Filtering out links that belong to a different hostname.
- Deciding when the crawl is finished so that worker threads can terminate.
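To make the first challenge concrete, here is an illustration of the check-then-act race on the visited set. The helper names unsafe_enqueue and safe_enqueue and the queue_put callback are purely illustrative, not part of the problem's API: two threads running the unsafe version can both find a URL absent and both enqueue it; holding a lock across the test and the insert removes that window.

from threading import Lock

visited = set()
visited_lock = Lock()

def unsafe_enqueue(url, queue_put):
    # RACE: two threads can both pass this test for the same url
    # before either of them adds it, so the url is enqueued twice.
    if url not in visited:
        visited.add(url)
        queue_put(url)

def safe_enqueue(url, queue_put):
    # The lock makes the membership test and the insert one atomic step.
    with visited_lock:
        if url not in visited:
            visited.add(url)
            queue_put(url)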
Here’s how we can approach the problem step by step:
1. Extract the hostname from a URL: the part after http:// and before the next / (or the end of the string if there is no later /). A sketch of this extraction follows the list.
2. Initialize the visited set with startUrl.
3. Use a thread-safe queue (Queue in Python or ConcurrentLinkedQueue in Java) to manage the URLs to be crawled, seeded with startUrl.
4. Start several worker threads; each one repeatedly takes a URL from the queue, fetches its links with HtmlParser.getUrls, and enqueues every same-hostname URL that has not been visited yet.
In summary, we use a concurrent BFS with a thread-safe visited set and queue, and multiple threads to parallelize the crawling.
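For step 1, here is a small sketch of the hostname extraction in Python, once with the standard library's urllib.parse (which the Python solution below uses) and once with the manual scan that the C++ solution uses:

from urllib.parse import urlparse

def hostname_stdlib(url):
    # urlparse understands the scheme and path for us.
    return urlparse(url).hostname

def hostname_manual(url):
    # Everything after "://" and before the next "/".
    start = url.find("://")
    if start == -1:
        return ""
    start += 3
    end = url.find("/", start)
    return url[start:] if end == -1 else url[start:end]

assert hostname_stdlib("http://news.yahoo.com/news/topics/") == "news.yahoo.com"
assert hostname_manual("http://news.yahoo.com/news/topics/") == "news.yahoo.com"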
Let's walk through an example. Suppose startUrl = "http://news.yahoo.com/news/topics/" and HtmlParser.getUrls(startUrl) returns:
["http://news.yahoo.com/news", "http://news.yahoo.com/news/topics/", "http://news.google.com"]
1. Start with startUrl: add it to the visited set and the queue.
2. A worker takes startUrl from the queue and fetches its links:
- http://news.yahoo.com/news - same hostname, not visited: add it to the queue and the visited set.
- http://news.yahoo.com/news/topics/ - same hostname, but already visited (it's startUrl).
- http://news.google.com - different hostname: ignore it.
3. A worker takes http://news.yahoo.com/news and fetches its links (suppose it returns ["http://news.yahoo.com", "http://news.yahoo.com/news/topics/"]):
- http://news.yahoo.com - same hostname, not visited: add it to the queue and the visited set.
- http://news.yahoo.com/news/topics/ - already visited: skip it.
4. A worker takes http://news.yahoo.com and fetches its links (suppose it returns []).
5. The queue is now empty, so the crawl ends with the result ["http://news.yahoo.com/news/topics/", "http://news.yahoo.com/news", "http://news.yahoo.com"] (the order may vary between runs).
This problem demonstrates how to combine graph traversal (BFS/DFS) with thread-safe programming. The key is to use concurrent data structures for the visited set and the work queue so that multiple threads can safely collaborate. By filtering URLs by hostname and ensuring no duplicates, we efficiently crawl all pages reachable from startUrl within the same domain. Threads speed up the crawling but demand careful synchronization to avoid race conditions and redundant work. The full solutions follow.
Python solution:

from threading import Thread, Lock
from queue import Queue, Empty
from urllib.parse import urlparse
from typing import List

class Solution:
    def crawl(self, startUrl: str, htmlParser: 'HtmlParser') -> List[str]:
        hostname = urlparse(startUrl).hostname
        visited = {startUrl}
        visited_lock = Lock()
        q = Queue()
        q.put(startUrl)

        def worker():
            while True:
                try:
                    # The short timeout lets idle workers exit once q.join() releases.
                    url = q.get(timeout=0.1)
                except Empty:
                    return
                for next_url in htmlParser.getUrls(url):
                    if urlparse(next_url).hostname == hostname:
                        with visited_lock:
                            # Test-and-insert under the lock so no URL is enqueued twice.
                            if next_url not in visited:
                                visited.add(next_url)
                                q.put(next_url)
                q.task_done()

        threads = []
        for _ in range(8):  # Thread count is tunable.
            t = Thread(target=worker, daemon=True)
            t.start()
            threads.append(t)
        q.join()  # Blocks until every enqueued URL has been processed.
        return list(visited)
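To try the Python solution locally, you can stub out HtmlParser with an in-memory link map. The sketch below is only for illustration: the class name FakeParser and its link data are made up here, mirroring the worked example above.

class FakeParser:
    def __init__(self, links):
        # links maps each URL to the list of URLs it links to.
        self.links = links

    def getUrls(self, url):
        return self.links.get(url, [])

parser = FakeParser({
    "http://news.yahoo.com/news/topics/": [
        "http://news.yahoo.com/news",
        "http://news.yahoo.com/news/topics/",
        "http://news.google.com",
    ],
    "http://news.yahoo.com/news": [
        "http://news.yahoo.com",
        "http://news.yahoo.com/news/topics/",
    ],
})
result = Solution().crawl("http://news.yahoo.com/news/topics/", parser)
print(sorted(result))  # The three news.yahoo.com URLs, in sorted order.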
C++ solution:

#include <vector>
#include <string>
#include <unordered_set>
#include <queue>
#include <mutex>
#include <condition_variable>
#include <thread>
using namespace std;

// This is the HtmlParser's API interface.
// You should not implement it, or speculate about its implementation.
class HtmlParser {
public:
    vector<string> getUrls(string url);
};

class Solution {
public:
    string getHostname(const string& url) {
        // Simple parse: "http://hostname/..." -> "hostname".
        size_t start = url.find("://");
        if (start == string::npos) return "";
        start += 3;
        size_t end = url.find('/', start);
        return end == string::npos ? url.substr(start) : url.substr(start, end - start);
    }

    vector<string> crawl(string startUrl, HtmlParser htmlParser) {
        string hostname = getHostname(startUrl);
        unordered_set<string> visited{startUrl};
        queue<string> q;
        q.push(startUrl);
        mutex mtx;
        condition_variable cv;
        int busy = 0;  // Threads currently fetching a page.

        auto worker = [&]() {
            while (true) {
                string url;
                {
                    unique_lock<mutex> lock(mtx);
                    // Wait for work; an empty queue with no thread fetching
                    // means no new URL can ever arrive, so the crawl is done.
                    cv.wait(lock, [&] { return !q.empty() || busy == 0; });
                    if (q.empty()) return;
                    url = q.front();
                    q.pop();
                    ++busy;
                }
                vector<string> urls = htmlParser.getUrls(url);  // No lock held during the fetch.
                {
                    lock_guard<mutex> lock(mtx);
                    for (const string& next_url : urls) {
                        // insert().second is true only for the first thread to add the URL.
                        if (getHostname(next_url) == hostname && visited.insert(next_url).second) {
                            q.push(next_url);
                        }
                    }
                    --busy;
                }
                cv.notify_all();  // New work may be available, or the crawl may be over.
            }
        };

        vector<thread> threads;
        for (int i = 0; i < 8; ++i)
            threads.emplace_back(worker);
        for (auto& t : threads)
            t.join();
        return vector<string>(visited.begin(), visited.end());
    }
};
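A note on the termination logic: the condition variable and the busy counter give the workers an exact stopping rule, because an empty queue plus zero in-flight fetches means no new URL can ever appear. A worker that simply exits whenever it happens to see an empty queue would still produce a correct result here, but it would usually collapse the pool to a single active thread early in the crawl.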
Java solution:

import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.net.*;

// Stub of the judge-provided interface so the file compiles standalone.
class HtmlParser {
    public List<String> getUrls(String url) { return new ArrayList<>(); }
}

class Solution {
    public List<String> crawl(String startUrl, HtmlParser htmlParser) {
        String hostname = getHostname(startUrl);
        Set<String> visited = ConcurrentHashMap.newKeySet();
        visited.add(startUrl);
        BlockingQueue<String> queue = new LinkedBlockingQueue<>();
        queue.add(startUrl);
        // URLs enqueued but not yet fully processed; zero means the crawl is done.
        AtomicInteger pending = new AtomicInteger(1);
        ExecutorService pool = Executors.newFixedThreadPool(8);
        List<Future<?>> futures = new ArrayList<>();

        Runnable worker = () -> {
            while (pending.get() > 0) {
                String url;
                try {
                    // Poll with a timeout so idle workers periodically re-check pending.
                    url = queue.poll(50, TimeUnit.MILLISECONDS);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    return;
                }
                if (url == null) continue;
                for (String nextUrl : htmlParser.getUrls(url)) {
                    // visited.add is atomic: only the first thread to add a URL enqueues it.
                    if (getHostname(nextUrl).equals(hostname) && visited.add(nextUrl)) {
                        pending.incrementAndGet();
                        queue.add(nextUrl);
                    }
                }
                pending.decrementAndGet();
            }
        };

        for (int i = 0; i < 8; ++i)
            futures.add(pool.submit(worker));
        for (Future<?> f : futures) {
            try { f.get(); } catch (Exception e) { /* best-effort crawl */ }
        }
        pool.shutdown();
        return new ArrayList<>(visited);
    }

    private String getHostname(String url) {
        try {
            return new URL(url).getHost();
        } catch (Exception e) {
            return "";
        }
    }
}
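The AtomicInteger pending plays the same role as the C++ busy counter: it counts URLs that have been enqueued but not yet fully processed, so workers treat pending reaching zero, rather than a momentarily empty queue, as the true end of the crawl.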
JavaScript solution:

/**
 * // This is the HtmlParser's API interface.
 * // function HtmlParser() {
 * //     @param {string} url
 * //     @return {string[]}
 * //     this.getUrls = function(url) {
 * //         ...
 * //     };
 * // }
 */

/**
 * @param {string} startUrl
 * @param {HtmlParser} htmlParser
 * @return {string[]}
 */
var crawl = function(startUrl, htmlParser) {
    function getHostname(url) {
        // For "http://host/path", splitting on '/' puts the host at index 2.
        return url.split('/')[2];
    }
    const hostname = getHostname(startUrl);
    const visited = new Set([startUrl]);
    // JavaScript on LeetCode is single-threaded, so a plain BFS suffices;
    // any real concurrency would come from the runtime, not from this code.
    const queue = [startUrl];
    let idx = 0;
    while (idx < queue.length) {
        const url = queue[idx++];
        for (const nextUrl of htmlParser.getUrls(url)) {
            if (getHostname(nextUrl) === hostname && !visited.has(nextUrl)) {
                visited.add(nextUrl);
                queue.push(nextUrl);
            }
        }
    }
    return Array.from(visited);
};