The Web Crawler problem on LeetCode asks you to crawl web pages starting from a given startUrl, collecting all URLs that belong to the same hostname as the startUrl. You are given an HtmlParser interface that can fetch all URLs from a given web page. Your task is to implement a function that returns a list of all URLs that are reachable from the startUrl and have the same hostname, using the HtmlParser interface. The crawler should not visit the same page more than once. Every URL has the form http://hostname/path, and the startUrl itself should be included in the result.

At first glance, this problem resembles a classic graph traversal, where each web page is a node and links between pages are edges. The main challenge is to traverse only the pages that share the same hostname as the starting page, and to avoid visiting the same page multiple times (preventing cycles or infinite loops).
A brute-force approach would be to visit every link from every page, regardless of hostname, but this would not satisfy the constraints. Instead, we need to (1) follow only links whose hostname matches the startUrl's hostname, and (2) remember which pages have already been visited so no page is processed twice. This leads us to consider using a queue (for BFS) or a stack (for DFS), along with a set to remember visited URLs.
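To make that pattern concrete before specializing it to URLs, here is a minimal BFS sketch over a plain adjacency-list graph. The graph dictionary is an illustrative stand-in for the link structure, not part of the problem's API:

from collections import deque

def bfs(start, graph):
    # graph maps a node to the list of nodes it links to (hypothetical example data).
    visited = {start}              # set gives O(1) membership checks
    queue = deque([start])         # FIFO queue drives the breadth-first order
    while queue:
        node = queue.popleft()
        for neighbor in graph.get(node, []):
            if neighbor not in visited:    # skipping seen nodes breaks cycles
                visited.add(neighbor)
                queue.append(neighbor)
    return list(visited)

# Example: bfs("A", {"A": ["B"], "B": ["A", "C"], "C": []}) returns ["A", "B", "C"] in some order.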
Let's break down the steps to solve this problem:

1. Parse the startUrl to get its hostname. Only URLs with this hostname will be crawled.
2. Initialize a queue with the startUrl and mark it as visited.
3. While the queue is not empty, take the next URL and call HtmlParser.getUrls(url) to fetch all linked URLs from the current page.
4. For each linked URL that has the same hostname and has not been visited yet, mark it visited and add it to the queue.

We use a set for visited URLs because lookups and insertions are O(1). The traversal can be either BFS or DFS; BFS is often preferred for web crawling to avoid deep recursion and stack overflows.
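As a quick illustration (this snippet is just a local check, not part of the submitted solution), Python's standard urllib.parse module can pull the hostname out of a URL directly:

from urllib.parse import urlparse

# .hostname strips the scheme and the path, which is exactly what we filter on.
print(urlparse("http://leetcode.com/faq/").hostname)  # leetcode.com
print(urlparse("http://google.com/").hostname)        # google.com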
Suppose startUrl = "http://leetcode.com/faq/" and the web looks like this:

- http://leetcode.com/faq/ links to http://leetcode.com/about/ and http://google.com/
- http://leetcode.com/about/ links to http://leetcode.com/faq/ (cycle!)
- http://google.com/ links to nothing

Steps:
1. Start at http://leetcode.com/faq/. The hostname is leetcode.com.
2. Visit http://leetcode.com/faq/, add it to the result.
3. Its links are http://leetcode.com/about/ (same hostname) and http://google.com/ (different hostname).
4. Add http://leetcode.com/about/ to the queue (it hasn't been visited and matches the hostname).
5. Visit http://leetcode.com/about/, add it to the result.
6. Its only link is http://leetcode.com/faq/ (already visited, skip).
7. Final result: [http://leetcode.com/faq/, http://leetcode.com/about/]

(A runnable mock of this example appears right after the Python solution below.)

# """
# This is the HtmlParser's interface.
# You should not implement it, or speculate about its implementation
# class HtmlParser:
# def getUrls(self, url: str) -> List[str]:
# """
from typing import List
from urllib.parse import urlparse
from collections import deque

class Solution:
    def crawl(self, startUrl: str, htmlParser: 'HtmlParser') -> List[str]:
        # Only URLs sharing this hostname are crawled.
        hostname = urlparse(startUrl).hostname
        visited = set()
        queue = deque([startUrl])
        visited.add(startUrl)
        while queue:
            url = queue.popleft()
            # Fetch every link on the current page; keep new, same-hostname URLs.
            for next_url in htmlParser.getUrls(url):
                if urlparse(next_url).hostname == hostname and next_url not in visited:
                    visited.add(next_url)
                    queue.append(next_url)
        # visited now holds exactly the reachable same-hostname URLs.
        return list(visited)
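To sanity-check the walkthrough example locally, one option is a small mock of the parser. The FakeHtmlParser class and its link table below are made-up test scaffolding, not LeetCode's actual HtmlParser; the snippet assumes the Solution class defined above:

class FakeHtmlParser:
    # Hypothetical stand-in for the judge's HtmlParser, for local testing only.
    def __init__(self, links):
        self.links = links                      # url -> list of linked urls

    def getUrls(self, url):
        return self.links.get(url, [])

links = {
    "http://leetcode.com/faq/": ["http://leetcode.com/about/", "http://google.com/"],
    "http://leetcode.com/about/": ["http://leetcode.com/faq/"],  # the cycle
    "http://google.com/": [],
}

result = Solution().crawl("http://leetcode.com/faq/", FakeHtmlParser(links))
print(sorted(result))
# ['http://leetcode.com/about/', 'http://leetcode.com/faq/']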
// /**
// * // This is the HtmlParser's interface.
// * // You should not implement it, or speculate about its implementation
// * class HtmlParser {
// * public:
// * vector<string> getUrls(string url);
// * };
// */
#include <string>
#include <vector>
#include <unordered_set>
#include <queue>
using namespace std;

class Solution {
public:
    vector<string> crawl(string startUrl, HtmlParser htmlParser) {
        string hostname = getHostname(startUrl);
        unordered_set<string> visited;
        queue<string> q;
        visited.insert(startUrl);
        q.push(startUrl);
        while (!q.empty()) {
            string url = q.front();
            q.pop();
            for (const string& nextUrl : htmlParser.getUrls(url)) {
                // Keep only unvisited URLs on the same hostname.
                if (getHostname(nextUrl) == hostname && visited.find(nextUrl) == visited.end()) {
                    visited.insert(nextUrl);
                    q.push(nextUrl);
                }
            }
        }
        return vector<string>(visited.begin(), visited.end());
    }

private:
    string getHostname(const string& url) {
        // Skip past the "://" scheme separator, then take everything up to the next '/'.
        size_t start = url.find("://") + 3;
        size_t end = url.find('/', start);
        if (end == string::npos) return url.substr(start);
        return url.substr(start, end - start);
    }
};
/*
// This is the HtmlParser's interface.
// You should not implement it, or speculate about its implementation
// interface HtmlParser {
// public List<String> getUrls(String url) {}
// }
*/
import java.util.*;

public class Solution {
    public List<String> crawl(String startUrl, HtmlParser htmlParser) {
        String hostname = getHostname(startUrl);
        Set<String> visited = new HashSet<>();
        Queue<String> queue = new LinkedList<>();
        visited.add(startUrl);
        queue.offer(startUrl);
        while (!queue.isEmpty()) {
            String url = queue.poll();
            for (String nextUrl : htmlParser.getUrls(url)) {
                // Keep only unvisited URLs on the same hostname.
                if (getHostname(nextUrl).equals(hostname) && !visited.contains(nextUrl)) {
                    visited.add(nextUrl);
                    queue.offer(nextUrl);
                }
            }
        }
        return new ArrayList<>(visited);
    }

    private String getHostname(String url) {
        // Skip past the "://" scheme separator, then take everything up to the next '/'.
        int start = url.indexOf("://") + 3;
        int end = url.indexOf("/", start);
        if (end == -1) return url.substring(start);
        return url.substring(start, end);
    }
}
/**
* // This is the HtmlParser's interface.
* // You should not implement it, or speculate about its implementation
* function HtmlParser() {
* @param {string} url
* @return {string[]}
* this.getUrls = function(url) {
* ...
* };
* }
*/
/**
* @param {string} startUrl
* @param {HtmlParser} htmlParser
* @return {string[]}
*/
var crawl = function(startUrl, htmlParser) {
    // Skip past the "://" scheme separator, then take everything up to the next '/'.
    function getHostname(url) {
        const start = url.indexOf("://") + 3;
        const end = url.indexOf("/", start);
        return end === -1 ? url.substring(start) : url.substring(start, end);
    }

    const hostname = getHostname(startUrl);
    const visited = new Set();
    const queue = [startUrl];
    visited.add(startUrl);
    while (queue.length > 0) {
        const url = queue.shift();
        for (const nextUrl of htmlParser.getUrls(url)) {
            if (getHostname(nextUrl) === hostname && !visited.has(nextUrl)) {
                visited.add(nextUrl);
                queue.push(nextUrl);
            }
        }
    }
    return Array.from(visited);
};
Brute-force Approach: If you crawl every URL regardless of hostname and without tracking visits, you could revisit the same pages infinitely (due to cycles), leading to unbounded time and space.
Optimized Approach (Current Solution):

- Time: proportional to the number of same-hostname pages reachable from the startUrl plus the total number of links examined on those pages. Each URL is visited and processed at most once.
- Space: proportional to the number of such pages, for the visited set and the queue used for BFS.
The Web Crawler problem is a real-world application of graph traversal techniques, with the key twist being the restriction to a single hostname and the need to avoid cycles. By using BFS (or DFS), a set for visited URLs, and careful hostname filtering, we efficiently and safely crawl all relevant pages. The approach is elegant because it combines familiar graph algorithms with practical web constraints, ensuring correctness and efficiency.