Step 1: Download the Html Agility Pack from the following link:
http://htmlagilitypack.codeplex.com/
Add a reference to the library in your project.
Step 2: Download the HTML web page to parse
Use the following code to download the web page.
Step 3: Parse the downloaded web page for <a href="..."></a> links using Html Agility Pack
/// <summary>
/// Starts an asynchronous download of <paramref name="url"/> into the local
/// file <c>c:\temp.html</c>. When the transfer finishes (successfully or not),
/// <see cref="client_DownloadFileCompleted"/> is invoked and the underlying
/// <see cref="WebClient"/> is disposed.
/// </summary>
/// <param name="url">Absolute address of the web page to download.</param>
/// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
/// <exception cref="UriFormatException"><paramref name="url"/> is not a valid URI.</exception>
private void Download(string url)
{
    // Build the Uri first so that a malformed address fails before any
    // disposable resource has been allocated.
    Uri address = new Uri(url);

    WebClient client = new WebClient();

    // Wrap the real handler so the WebClient is always released once the
    // transfer has completed, even if the handler itself throws.
    client.DownloadFileCompleted += delegate(object sender, AsyncCompletedEventArgs e)
    {
        try
        {
            client_DownloadFileCompleted(sender, e);
        }
        finally
        {
            client.Dispose();
        }
    };

    try
    {
        client.DownloadFileAsync(address, @"c:\temp.html");
    }
    catch
    {
        // The transfer never started, so the completion callback will not
        // run; release the client here instead.
        client.Dispose();
        throw;
    }
}

/// <summary>
/// Called when the download started by <see cref="Download"/> has finished.
/// </summary>
/// <param name="sender">The object that raised the event.</param>
/// <param name="e">Completion details for the asynchronous operation.</param>
void client_DownloadFileCompleted(object sender, AsyncCompletedEventArgs e)
{
    //do something here
}

// Use the following code in the client_DownloadFileCompleted() to enumerate all links in the web page.
/// <summary>
/// Completion handler for the page download: loads the saved file
/// <c>c:\temp.html</c> with Html Agility Pack and writes the <c>href</c>
/// value of every anchor element to the console.
/// </summary>
/// <param name="sender">The object that raised the event.</param>
/// <param name="e">
/// Completion details; the file is only parsed when the operation was neither
/// cancelled nor failed.
/// </param>
void client_DownloadFileCompleted(object sender, AsyncCompletedEventArgs e)
{
    // A cancelled or failed transfer leaves no trustworthy file to parse.
    if (e.Cancelled)
    {
        return;
    }

    if (e.Error != null)
    {
        Console.WriteLine("Download failed: {0}", e.Error.Message);
        return;
    }

    HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
    doc.Load("c:\\temp.html", Encoding.UTF8);

    // Select the anchor elements that carry an href attribute. SelectNodes
    // returns null (not an empty collection) when nothing matches, so it
    // must be checked before enumerating.
    HtmlNodeCollection links = doc.DocumentNode.SelectNodes("//a[@href]");
    if (links == null)
    {
        return;
    }

    foreach (HtmlNode link in links)
    {
        HtmlAttribute att = link.Attributes["href"];
        if (att == null)
        {
            continue;
        }

        string url = att.Value;
        Console.WriteLine("Url in temp.html: {0}", url);
    }
}
No comments:
Post a Comment