正则表达式相关:C# 抓取网页类(获取网页中所有信息)
发布时间:2020-12-13 22:08:45 所属栏目:百科 来源:网络整理
导读:类的代码(节选,完整代码见正文): using System; using System.Data; using System.Configuration; using System.Net; using System.IO; using System.Text; using System.Collections.Generic; using System.Text.RegularExpressions; using System.Threading; ……
// Class source code:
using System;
using System.Collections.Generic;
using System.Configuration;
using System.Data;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Web;
using System.Web.UI.MobileControls;

/// <summary>
/// Downloads a single web page and exposes its title, raw HTML, plain text
/// and hyperlinks. The download happens once, in the constructor; every
/// other member works on the cached HTML. The constructor never throws for
/// download problems: check <see cref="IsGood"/> (and <see cref="LastError"/>)
/// to find out whether the page could be fetched.
/// </summary>
public class WebPage
{
    #region Private members

    /// <summary>Responses larger than this (4 MiB) are rejected.</summary>
    private const int MaxContentLength = 1 << 22;

    /// <summary>Request timeout in milliseconds.</summary>
    private const int RequestTimeoutMs = 10000;

    /// <summary>
    /// If decoding with the charset declared in the HTML produces at least this
    /// many replacement characters, the declaration is considered wrong.
    /// </summary>
    private const int MaxUndecodableChars = 100;

    private const RegexOptions DefaultOptions = RegexOptions.Multiline | RegexOptions.IgnoreCase;

    /// <summary>URL suffixes that are never downloaded.</summary>
    private static readonly string[] SkippedExtensions = { ".rar", ".dat", ".msi" };

    // The patterns are fixed, so they are built once instead of on every call.
    private static readonly Regex TitleRegex =
        new Regex(@"<title[^>]*>(?<title>[\s\S]*?)</title[^>]*>", DefaultOptions);

    private static readonly Regex AnchorRegex =
        new Regex(@"<a\shref\s*=""(?<URL>[^""]*).*?>(?<title>[^<]*)</a>",
                  RegexOptions.IgnoreCase | RegexOptions.Singleline);

    // The group is named "URL" (not "url"): group names are case-sensitive and
    // getLinks() reads the same group name for both patterns.
    private static readonly Regex FrameRegex =
        new Regex(@"<[i]*frame[^><]+src=(""|')?(?<URL>([^>""'\s)])+)(""|')?[^>]*>",
                  RegexOptions.IgnoreCase);

    private static readonly Regex LinkTextNoiseRegex =
        new Regex("(<[^>]+>)|(\\s)|(&nbsp;)|&|\"", DefaultOptions);

    private static readonly Regex ScriptRegex =
        new Regex(@"<script[^>]*>[\s\S]*?</script[^>]*>", DefaultOptions);

    private static readonly Regex StyleRegex =
        new Regex(@"<style[^>]*>[\s\S]*?</style[^>]*>", DefaultOptions);

    private static readonly Regex SelectRegex =
        new Regex(@"<select[^>]*>[\s\S]*?</select[^>]*>", DefaultOptions);

    private static readonly Regex AnchorBlockRegex =
        new Regex(@"<a[^>]*>[\s\S]*?</a[^>]*>", DefaultOptions);

    private static readonly Regex TagRegex =
        new Regex("(<[^>]+?>)|&nbsp;", DefaultOptions);

    private static readonly Regex WhitespaceRegex =
        new Regex("(\\s)+", DefaultOptions);

    private static readonly Regex MetaCharsetRegex =
        new Regex(@"charset\s*=\s*[""']?(?<cding>[\w\-]+)", RegexOptions.IgnoreCase);

    /// <summary>Cookies of every visited host, shared by all pages.</summary>
    private static readonly Dictionary<string, CookieContainer> webcookies =
        new Dictionary<string, CookieContainer>();

    /// <summary>Guards <see cref="webcookies"/>.</summary>
    private static readonly object cookieLock = new object();

    private Uri m_uri;            // address of the page (after redirects)
    private List<Link> m_links;   // links found on the page, filled lazily
    private string m_title;       // page title, filled lazily
    private string m_html;        // raw HTML
    private string m_outstr;      // plain text of the page, filled lazily
    private bool m_good;          // true only when the download succeeded
    private int m_pagesize;       // length of the HTML in characters
    private Exception m_error;    // reason of the failed download, if any

    #endregion

    #region Properties

    /// <summary>Absolute URL of the page, or an empty string if the URL was invalid.</summary>
    public string URL
    {
        get { return m_uri == null ? string.Empty : m_uri.AbsoluteUri; }
    }

    /// <summary>Content of the <c>&lt;title&gt;</c> element, or an empty string.</summary>
    public string Title
    {
        get
        {
            if (string.IsNullOrEmpty(m_title))
            {
                Match mc = TitleRegex.Match(M_html);
                m_title = mc.Success ? mc.Groups["title"].Value.Trim() : string.Empty;
            }
            return m_title;
        }
    }

    /// <summary>Raw HTML of the page; never null.</summary>
    public string M_html
    {
        get
        {
            if (m_html == null)
            {
                m_html = "";
            }
            return m_html;
        }
    }

    /// <summary>All links (anchors and frames) found on the page.</summary>
    public List<Link> Links
    {
        get { return getLinks(); }
    }

    /// <summary>Plain text of the page (tags, scripts and styles removed).</summary>
    public string Context
    {
        get
        {
            if (string.IsNullOrEmpty(m_outstr))
            {
                getContext(Int16.MaxValue);
            }
            return m_outstr;
        }
    }

    /// <summary>Size of the page in characters.</summary>
    public int PageSize
    {
        get { return m_pagesize; }
    }

    /// <summary>Links that point to the same host as the page (http or https).</summary>
    public List<Link> InsiteLinks
    {
        get
        {
            if (m_uri == null)
            {
                return new List<Link>();
            }
            // The host is escaped (its dots are literals) and must be followed by
            // a URL delimiter, so "example.com.evil.org" is not an in-site link.
            string pattern = "^https?://" + Regex.Escape(m_uri.Host) + "(?=[:/?#]|$)";
            return getSpecialLinksByUrl(pattern, Int16.MaxValue);
        }
    }

    /// <summary>True when the page was downloaded successfully.</summary>
    public bool IsGood
    {
        get { return m_good; }
    }

    /// <summary>Host name of the page, or an empty string if the URL was invalid.</summary>
    public string Host
    {
        get { return m_uri == null ? string.Empty : m_uri.Host; }
    }

    /// <summary>The exception that made the download fail, or null.</summary>
    public Exception LastError
    {
        get { return m_error; }
    }

    #endregion

    #region Private helpers

    /// <summary>
    /// Extracts the links from the HTML on first use and caches them.
    /// </summary>
    /// <returns>The cached list of links.</returns>
    private List<Link> getLinks()
    {
        if (m_links.Count != 0 || m_uri == null)
        {
            return m_links;
        }

        string html = M_html;
        Regex[] patterns = { AnchorRegex, FrameRegex };
        for (int i = 0; i < patterns.Length; i++)
        {
            for (Match match = patterns[i].Match(html); match.Success; match = match.NextMatch())
            {
                try
                {
                    string url = HttpUtility.UrlDecode(
                        new Uri(m_uri, match.Groups["URL"].Value).AbsoluteUri);

                    // Only anchors carry a caption; it is captured by the "title" group.
                    string text = "";
                    if (i == 0)
                    {
                        text = LinkTextNoiseRegex.Replace(match.Groups["title"].Value, "");
                    }

                    Link link = new Link();
                    link.Text = text;
                    link.NavigateUrl = url;
                    m_links.Add(link);
                }
                catch (UriFormatException ex)
                {
                    // A malformed href must not abort the scan of the remaining links.
                    Console.WriteLine(ex.Message);
                }
            }
        }
        return m_links;
    }

    /// <summary>
    /// Converts HTML to plain text (cached in <c>m_outstr</c>) and returns its
    /// first characters.
    /// </summary>
    /// <param name="instr">HTML source.</param>
    /// <param name="firstN">Maximum number of characters to return.</param>
    /// <param name="withLink">Whether the text inside links is kept.</param>
    /// <returns>At most <paramref name="firstN"/> characters of plain text.</returns>
    private string getFirstNchar(string instr, int firstN, bool withLink)
    {
        if (string.IsNullOrEmpty(m_outstr))
        {
            string text = instr ?? "";
            text = ScriptRegex.Replace(text, "");
            text = StyleRegex.Replace(text, "");
            text = SelectRegex.Replace(text, "");
            if (!withLink)
            {
                text = AnchorBlockRegex.Replace(text, "");
            }
            text = TagRegex.Replace(text, "");
            text = WhitespaceRegex.Replace(text, " ");
            m_outstr = text;
        }
        return m_outstr.Length > firstN ? m_outstr.Substring(0, firstN) : m_outstr;
    }

    /// <summary>
    /// Returns up to <paramref name="count"/> links whose URL or caption matches
    /// <paramref name="pattern"/>.
    /// </summary>
    private List<Link> filterLinks(string pattern, int count, bool matchUrl)
    {
        getLinks();
        Regex filter = new Regex(pattern, DefaultOptions);
        List<Link> selected = new List<Link>();
        foreach (Link link in m_links)
        {
            if (selected.Count >= count)
            {
                break;
            }
            string candidate = matchUrl ? link.NavigateUrl : link.Text;
            if (filter.IsMatch(candidate ?? ""))
            {
                selected.Add(link);
            }
        }
        return selected;
    }

    /// <summary>Returns the cookie container shared by all pages of a host.</summary>
    private static CookieContainer getCookies(string host)
    {
        lock (cookieLock)
        {
            CookieContainer cookies;
            if (!webcookies.TryGetValue(host, out cookies))
            {
                cookies = new CookieContainer();
                webcookies[host] = cookies;
            }
            return cookies;
        }
    }

    /// <summary>True when the URL ends with an extension that is never downloaded.</summary>
    private static bool hasSkippedExtension(string url)
    {
        foreach (string extension in SkippedExtensions)
        {
            if (url.EndsWith(extension, StringComparison.OrdinalIgnoreCase))
            {
                return true;
            }
        }
        return false;
    }

    /// <summary>Reads a stream to its end.</summary>
    private static byte[] readAllBytes(Stream source)
    {
        using (MemoryStream buffer = new MemoryStream())
        {
            byte[] chunk = new byte[8192];
            int read;
            while ((read = source.Read(chunk, 0, chunk.Length)) > 0)
            {
                buffer.Write(chunk, 0, read);
            }
            return buffer.ToArray();
        }
    }

    /// <summary>Decodes bytes with the given encoding, honouring a byte order mark.</summary>
    private static string decode(byte[] raw, Encoding encoding)
    {
        using (StreamReader reader = new StreamReader(new MemoryStream(raw), encoding))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>Returns the encoding with the given name, or null if it is unknown.</summary>
    private static Encoding findEncoding(string name)
    {
        if (string.IsNullOrEmpty(name))
        {
            return null;
        }
        try
        {
            return Encoding.GetEncoding(name);
        }
        catch (ArgumentException)
        {
            return null;
        }
        catch (NotSupportedException)
        {
            return null;
        }
    }

    /// <summary>Extracts the charset parameter from a Content-Type header value.</summary>
    private static string extractCharset(string contentType)
    {
        const string key = "charset=";
        int ix = contentType.IndexOf(key, StringComparison.OrdinalIgnoreCase);
        if (ix == -1)
        {
            return null;
        }
        string charset = contentType.Substring(ix + key.Length);
        int end = charset.IndexOf(';');
        if (end != -1)
        {
            charset = charset.Substring(0, end);
        }
        return charset.Trim(' ', '\t', '"', '\'');
    }

    /// <summary>
    /// Decodes the response body. The charset of the Content-Type header wins;
    /// otherwise the charset declared inside the HTML is used, and the system
    /// default encoding is the last resort.
    /// </summary>
    private static string decodeHtml(byte[] raw, string contentType)
    {
        Encoding fromHeader = findEncoding(extractCharset(contentType));
        if (fromHeader != null)
        {
            return decode(raw, fromHeader);
        }

        // The charset declaration is plain ASCII, so it can be located in a
        // provisional decoding. The final text is then decoded from the raw bytes,
        // never from this provisional string (that round trip would lose bytes).
        string provisional = decode(raw, Encoding.Default);
        Match declaration = MetaCharsetRegex.Match(provisional);
        Encoding declared = declaration.Success
            ? findEncoding(declaration.Groups["cding"].Value)
            : null;
        if (declared == null)
        {
            return provisional;
        }

        string decoded = decode(raw, declared);

        // Undecodable bytes turn into '?' or U+FFFD. Question marks that are
        // already present in the source (byte 0x3F) are not decoding failures.
        int replacements = 0;
        foreach (char c in decoded)
        {
            if (c == '?' || c == '\uFFFD')
            {
                replacements++;
            }
        }
        foreach (byte b in raw)
        {
            if (b == 0x3F)
            {
                replacements--;
            }
        }
        return replacements >= MaxUndecodableChars ? provisional : decoded;
    }

    #endregion

    #region Public methods

    /// <summary>
    /// Returns the first characters of the page's plain text, link captions included.
    /// </summary>
    /// <param name="firstN">Maximum number of characters to return.</param>
    /// <returns>At most <paramref name="firstN"/> characters of plain text.</returns>
    public string getContext(int firstN)
    {
        return getFirstNchar(M_html, firstN, true);
    }

    /// <summary>
    /// Returns links of this page whose URL matches a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression applied to the link URL.</param>
    /// <param name="count">Maximum number of links to return.</param>
    /// <returns>The matching links, in document order.</returns>
    public List<Link> getSpecialLinksByUrl(string pattern, int count)
    {
        return filterLinks(pattern, count, true);
    }

    /// <summary>
    /// Returns links of this page whose caption matches a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression applied to the link text.</param>
    /// <param name="count">Maximum number of links to return.</param>
    /// <returns>The matching links, in document order.</returns>
    public List<Link> getSpecialLinksByText(string pattern, int count)
    {
        return filterLinks(pattern, count, false);
    }

    /// <summary>
    /// Searches the page's plain text for a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression with at least one capturing group.</param>
    /// <returns>The first capturing group of the first match, or an empty string.</returns>
    public string getSpecialWords(string pattern)
    {
        if (string.IsNullOrEmpty(m_outstr))
        {
            getContext(Int16.MaxValue);
        }
        Match mc = new Regex(pattern, DefaultOptions).Match(m_outstr);
        return mc.Success ? mc.Groups[1].Value : string.Empty;
    }

    #endregion

    #region Construction

    /// <summary>
    /// Downloads the page. Never throws for download problems: on any failure
    /// the page is left empty, <see cref="IsGood"/> is false and the cause is
    /// kept in <see cref="LastError"/>.
    /// </summary>
    private void Init(string _url)
    {
        // Initialise everything first so that the properties stay usable even
        // when the URL is invalid or the download fails.
        m_links = new List<Link>();
        m_html = "";
        m_outstr = "";
        m_title = "";
        m_pagesize = 0;
        m_good = false;
        m_error = null;

        try
        {
            m_uri = new Uri(_url);
            if (hasSkippedExtension(_url))
            {
                return;
            }

            HttpWebRequest rqst = (HttpWebRequest)WebRequest.Create(m_uri);
            rqst.AllowAutoRedirect = true;
            rqst.MaximumAutomaticRedirections = 3;
            rqst.UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";
            rqst.KeepAlive = true;
            rqst.Timeout = RequestTimeoutMs;
            rqst.CookieContainer = getCookies(m_uri.Host);

            using (HttpWebResponse rsps = (HttpWebResponse)rqst.GetResponse())
            {
                string contentType = rsps.ContentType ?? "";
                if (!contentType.StartsWith("text/", StringComparison.OrdinalIgnoreCase)
                    || rsps.ContentLength > MaxContentLength)
                {
                    return;
                }

                byte[] raw;
                using (Stream sm = rsps.GetResponseStream())
                {
                    raw = readAllBytes(sm);
                }

                // Depending on the site the markup may additionally need
                // HttpUtility.HtmlDecode; it is deliberately left untouched here.
                m_html = decodeHtml(raw, contentType);
                m_pagesize = m_html.Length;
                m_uri = rsps.ResponseUri;
                m_good = true;
            }
        }
        catch (Exception ex)
        {
            // Best effort by design: callers inspect IsGood instead of catching.
            m_error = ex;
            m_good = false;
        }
    }

    /// <summary>
    /// Creates the page object and downloads the page immediately.
    /// </summary>
    /// <param name="_url">Absolute URL of the page; percent-escapes are unescaped first.</param>
    public WebPage(string _url)
    {
        if (!string.IsNullOrEmpty(_url))
        {
            try
            {
                _url = Uri.UnescapeDataString(_url);
            }
            catch (UriFormatException)
            {
                // Keep the URL exactly as given when it cannot be unescaped.
            }
        }
        Init(_url);
    }

    #endregion
}

// (Editor: Li Datong)
// [Disclaimer] All content on this site is collected from the Internet. The
// opinions expressed are those of the individual authors and do not represent
// the position of this site. If your rights have been infringed unintentionally,
// please contact the webmaster promptly to have the content removed.