最近有人討論到這個問題,小弟去找了一個 Regular Expression,還不錯用。
雖然不一定能完全正確地取出 Url,但我覺得命中率已經很高了。
以下就用範例來介紹,分享給大家。
asp.net(c#)
Default.aspx
<%@ Page Language="C#" AutoEventWireup="true" ValidateRequest="false" CodeBehind="Default.aspx.cs" Inherits="Test._Default" %>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" >
<head id="Head1" runat="server">
<title>利用正規表達式(Regular Expression)取得網頁內容的超連結Url</title>
</head>
<body>
<form id="form1" runat="server">
<div>
URL:
<asp:TextBox ID="TextBox1" runat="server" Width="340px">http://tw.yahoo.com</asp:TextBox>
<asp:Button ID="Button1" runat="server" OnClick="Button1_Click" Text="GetHtmlUrl" /><br />
<asp:GridView ID="GridView1" runat="server"></asp:GridView>
</div>
</form>
</body>
</html>
Default.aspx.cs
using System;
using System.Data;
using System.Configuration;
using System.Collections;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using System.Net;
using System.Text.RegularExpressions;
using System.Collections.Generic;
using System.Text;
using System.Data;
using System.Configuration;
using System.Collections;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
using System.Web.UI.HtmlControls;
using System.Net;
using System.Text.RegularExpressions;
using System.Collections.Generic;
using System.Text;
namespace Test
{
/// <summary>
/// Code-behind for Default.aspx: downloads the page whose address is typed
/// into <c>TextBox1</c> and lists the anchor tags found in it in <c>GridView1</c>.
/// </summary>
public partial class _Default : System.Web.UI.Page
{
    /// <summary>
    /// Page load handler; intentionally does nothing.
    /// </summary>
    /// <param name="sender">Event source.</param>
    /// <param name="e">Event data.</param>
    protected void Page_Load(object sender, EventArgs e)
    {
    }

    /// <summary>
    /// Click handler for <c>Button1</c>. Downloads the HTML at the address in
    /// <c>TextBox1</c>, extracts every anchor tag matched by the regular
    /// expression below and binds the resulting list to <c>GridView1</c>.
    /// </summary>
    /// <remarks>
    /// Only absolute http/https addresses are fetched. Any other input binds an
    /// empty list instead of being handed to <see cref="WebClient"/>, which would
    /// otherwise also accept file:// URIs and local paths typed by the visitor.
    /// Download failures still surface as exceptions from
    /// <see cref="WebClient.DownloadString(Uri)"/>.
    /// </remarks>
    /// <param name="sender">Event source.</param>
    /// <param name="e">Event data.</param>
    protected void Button1_Click(object sender, EventArgs e)
    {
        List<Href> list = new List<Href>();

        // Validate the user-supplied address before touching the network.
        Uri address;
        bool isWebAddress = Uri.TryCreate(this.TextBox1.Text, UriKind.Absolute, out address)
            && (address.Scheme == Uri.UriSchemeHttp || address.Scheme == Uri.UriSchemeHttps);

        if (isWebAddress)
        {
            string htmlCode;

            // WebClient is IDisposable; release it as soon as the download is done.
            using (WebClient client = new WebClient())
            {
                client.Encoding = Encoding.UTF8;
                htmlCode = client.DownloadString(address);
            }

            // Unescaped form of the pattern, for readability:
            //(?<HTML><a[^>]*href\s*=\s*[\"\']?(?<HRef>[^"'>\s]*)[\"\']?[^>]*>(?<Title>[^<]+|.*?)?</a\s*>)
            string regPattern = @"(?<HTML><a[^>]*href\s*=\s*[\""\']?(?<HRef>[^""'>\s]*)[\""\']?[^>]*>(?<Title>[^<]+|.*?)?</a\s*>)";

            foreach (Match match in Regex.Matches(htmlCode, regPattern))
            {
                // Read the captures by name so the code does not depend on group numbering.
                list.Add(new Href(
                    match.Groups["HTML"].Value,
                    match.Groups["HRef"].Value,
                    match.Groups["Title"].Value));
            }
        }

        this.GridView1.DataSource = list;
        this.GridView1.DataBind();
    }
}
/// <summary>
/// Simple data holder for one hyperlink: the complete anchor tag text,
/// the URL taken from its href attribute, and the text shown as its title.
/// </summary>
public class Href
{
    private string _Tag;
    private string _Url;
    private string _Title;

    /// <summary>
    /// Creates a hyperlink entry from its three parts.
    /// </summary>
    /// <param name="Tag">Value stored in <see cref="Tag"/>.</param>
    /// <param name="Url">Value stored in <see cref="Url"/>.</param>
    /// <param name="Title">Value stored in <see cref="Title"/>.</param>
    public Href(string Tag, string Url, string Title)
    {
        _Tag = Tag;
        _Url = Url;
        _Title = Title;
    }

    /// <summary>Gets or sets the tag text.</summary>
    public string Tag
    {
        set { _Tag = value; }
        get { return _Tag; }
    }

    /// <summary>Gets or sets the URL.</summary>
    public string Url
    {
        set { _Url = value; }
        get { return _Url; }
    }

    /// <summary>Gets or sets the title text.</summary>
    public string Title
    {
        set { _Title = value; }
        get { return _Title; }
    }
}
}
執行結果:
參考網址:
沒有留言:
張貼留言