
WebClient wc = new WebClient();
// ....
string downloadedfile = wc.DownloadString("http://www.myurl.com/");
但是,有时下载到的内容中包含"有趣"的字符,例如 é 变成 Ã©,← 变成 â†,フシギダネ 变成 ãƒ•ã‚·ã‚®ãƒ€ãƒ。
我认为这可能与不同的Unicode编码类型之类的东西有关,因为每个字符都会变成2个新字符,也许每一个字符都被分成了两部分,但我在这方面的知识很少。你觉得是哪里出错了?
解决方法 这是一个封装好的下载类,它支持gzip,并会检查响应头中的编码以及HTML中的meta标签,以便正确解码内容。实例化该类,然后调用GetPage()即可。
/// <summary>
/// Downloads a resource over HTTP and decodes it to a string.
/// Handles gzip/deflate content encodings and picks the text encoding from the
/// Content-Type response header, then from a <c>&lt;meta ... charset=...&gt;</c>
/// declaration found in the downloaded markup.
/// </summary>
/// <remarks>
/// Usage: construct an instance and call <see cref="GetPage"/>.
/// </remarks>
public class HttpDownloader
{
    // Extracts the charset parameter from a raw Content-Type header value.
    private static readonly Regex HeaderCharsetRegex = new Regex(
        @";\s*charset\s*=\s*(?<charset>.*)",
        RegexOptions.IgnoreCase);

    // Extracts the charset from a <meta> tag. The optional quote lets HTML5-style
    // <meta charset="utf-8"> match, and [^>] keeps the match inside a single tag.
    private static readonly Regex MetaCharsetRegex = new Regex(
        @"<meta\s+[^>]*?charset\s*=\s*[""']?(?<charset>[A-Za-z0-9_-]+)",
        RegexOptions.IgnoreCase);

    private readonly string _referer;
    private readonly string _userAgent;

    /// <summary>
    /// Encoding used for the first decoding pass. Starts as ISO-8859-1 and is
    /// replaced when the response header names a charset that is supported.
    /// </summary>
    public Encoding Encoding { get; set; }

    /// <summary>Headers of the most recent response.</summary>
    public WebHeaderCollection Headers { get; set; }

    /// <summary>Requested URL; replaced by the response URI after <see cref="GetPage"/>.</summary>
    public Uri Url { get; set; }

    /// <summary>Creates a downloader for the given URL.</summary>
    /// <param name="url">Absolute URL to download; parsed with <see cref="Uri"/>.</param>
    /// <param name="referer">Referer header value; not sent when null or empty.</param>
    /// <param name="userAgent">User-Agent header value; not sent when null or empty.</param>
    public HttpDownloader(string url, string referer, string userAgent)
    {
        Encoding = Encoding.GetEncoding("ISO-8859-1");
        Url = new Uri(url); // verify the uri
        _userAgent = userAgent;
        _referer = referer;
    }

    /// <summary>
    /// Requests <see cref="Url"/> and returns the decoded, trimmed response body.
    /// Also updates <see cref="Headers"/> and <see cref="Url"/> from the response.
    /// </summary>
    /// <returns>The response body as text.</returns>
    public string GetPage()
    {
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
        if (!string.IsNullOrEmpty(_referer))
        {
            request.Referer = _referer;
        }

        if (!string.IsNullOrEmpty(_userAgent))
        {
            request.UserAgent = _userAgent;
        }

        request.Headers.Add(HttpRequestHeader.AcceptEncoding, "gzip,deflate");

        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        {
            Headers = response.Headers;
            Url = response.ResponseUri;
            return ProcessContent(response);
        }
    }

    // Reads the whole (decompressed) body into memory, decodes it with the
    // header-derived encoding, then re-decodes if a <meta> charset disagrees.
    private string ProcessContent(HttpWebResponse response)
    {
        SetEncodingFromHeader(response);

        byte[] body;
        using (Stream content = OpenDecodedStream(response))
        using (MemoryStream buffer = new MemoryStream())
        {
            content.CopyTo(buffer);
            body = buffer.ToArray();
        }

        string html = Decode(body, Encoding);
        return CheckMetaCharSetAndReEncode(body, html);
    }

    // Wraps the response stream in a decompressor matching the Content-Encoding header.
    private static Stream OpenDecodedStream(HttpWebResponse response)
    {
        Stream raw = response.GetResponseStream();
        string contentEncoding = response.ContentEncoding ?? string.Empty;

        if (contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            return new GZipStream(raw, CompressionMode.Decompress);
        }

        if (contentEncoding.IndexOf("deflate", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            return new DeflateStream(raw, CompressionMode.Decompress);
        }

        return raw;
    }

    // Decodes the bytes with the given encoding (byte-order-mark detection stays
    // enabled, as in the StreamReader(Stream, Encoding) overload) and trims the result.
    private static string Decode(byte[] body, Encoding encoding)
    {
        using (StreamReader reader = new StreamReader(new MemoryStream(body), encoding, true))
        {
            return reader.ReadToEnd().Trim();
        }
    }

    // Sets Encoding from the response's charset; falls back to parsing the raw
    // Content-Type value when CharacterSet is empty.
    private void SetEncodingFromHeader(HttpWebResponse response)
    {
        string charset = response.CharacterSet;
        if (string.IsNullOrEmpty(charset))
        {
            Match m = HeaderCharsetRegex.Match(response.ContentType ?? string.Empty);
            if (m.Success)
            {
                charset = m.Groups["charset"].Value.Trim().Trim('\'', '"');
            }
        }

        Encoding headerEncoding = TryGetEncoding(charset);
        if (headerEncoding != null)
        {
            Encoding = headerEncoding;
        }
    }

    // Looks for a <meta> charset in the decoded text and, when it names a supported
    // encoding different from the one already used, decodes the raw bytes again.
    private string CheckMetaCharSetAndReEncode(byte[] body, string html)
    {
        Match m = MetaCharsetRegex.Match(html);
        if (!m.Success)
        {
            return html;
        }

        // Invariant lowering: culture-sensitive casing can corrupt names such as "ISO-8859-1".
        string charset = m.Groups["charset"].Value.ToLowerInvariant();

        // Kept from the original: a "unicode"/"utf-16" meta declaration is treated as UTF-8.
        if (charset == "unicode" || charset == "utf-16")
        {
            charset = "utf-8";
        }

        Encoding metaEncoding = TryGetEncoding(charset);
        if (metaEncoding == null || metaEncoding.Equals(Encoding))
        {
            return html;
        }

        return Decode(body, metaEncoding);
    }

    // Returns the named encoding, or null when the name is empty or not supported.
    private static Encoding TryGetEncoding(string charset)
    {
        if (string.IsNullOrEmpty(charset))
        {
            return null;
        }

        try
        {
            return Encoding.GetEncoding(charset);
        }
        catch (ArgumentException)
        {
            // Deliberate best effort: an unknown charset keeps the current encoding.
            return null;
        }
    }
}

总结 以上是内存溢出为你收集整理的c# – 从互联网下载HTML后,字符串中的字符已更改全部内容,希望文章能够帮你解决c# – 从互联网下载HTML后,字符串中的字符已更改所遇到的程序开发问题。
如果觉得内存溢出网站内容还不错,欢迎将内存溢出网站推荐给程序员好友。
欢迎分享,转载请注明来源:内存溢出
微信扫一扫
支付宝扫一扫
评论列表(0条)