查看: 2499|回复: 0

[.NET开发] Ioc在重构代码中的应用

发表于 2017-3-24 12:00:03

  最近lz在写抓取工商公示系统(http://www.gsxt.gov.cn/index.html)的爬虫,其中的难点就在于怎么通过极验验证码,搞得我不要不要的!如下:

简直是各种坑,被搞的死去活来以后还是解决了。现在回到主题!

  我们不是要抓工商公示系统的数据吗?所以我们先建两个实体BaseInfo(基本信息)和LegInfo(股东信息)

  /// <summary>
  /// Basic registration record scraped for a single company.
  /// All scraped values are kept as raw strings; only the keys are integers.
  /// </summary>
  public partial class BaseInfo
  {
      /// <summary>Creates an empty record.</summary>
      public BaseInfo() { }

      /// <summary>
      /// Creates an empty record. The <paramref name="html"/> argument is accepted
      /// but is not read by this constructor.
      /// </summary>
      /// <param name="html">Page HTML (currently unused here).</param>
      public BaseInfo(string html) { }

      #region Model

      /// <summary>Record identifier.</summary>
      public int Id { get; set; }

      /// <summary>
      /// Establishment date according to the original comment.
      /// NOTE(review): the name reads like "approval date" - confirm which is intended.
      /// </summary>
      public string ApprDate { get; set; }

      /// <summary>Full company name.</summary>
      public string EntName { get; set; }

      /// <summary>Company type.</summary>
      public string EntType { get; set; }

      /// <summary>Registered address (domicile).</summary>
      public string Dom { get; set; }

      /// <summary>
      /// Approval date according to the original comment.
      /// NOTE(review): the name reads like "establishment date" - confirm which is intended.
      /// </summary>
      public string EstDate { get; set; }

      /// <summary>Legal representative.</summary>
      public string Lerep { get; set; }

      /// <summary>Start of the business term.</summary>
      public string OpFrom { get; set; }

      /// <summary>End of the business term.</summary>
      public string OpTo { get; set; }

      /// <summary>Business scope.</summary>
      public string OpScope { get; set; }

      /// <summary>Registration number.</summary>
      public string RegNo { get; set; }

      /// <summary>Registration authority.</summary>
      public string RegOrg { get; set; }

      /// <summary>Registration status.</summary>
      public string RegState { get; set; }

      /// <summary>Registered capital.</summary>
      public string RegCap { get; set; }

      /// <summary>Industry sector.</summary>
      public string IndcodeNameLv2 { get; set; }

      /// <summary>Province.</summary>
      public string Province { get; set; }

      /// <summary>City.</summary>
      public string City { get; set; }

      /// <summary>Website URL.</summary>
      public string Weburl { get; set; }

      /// <summary>Rating.</summary>
      public string Rating { get; set; }

      /// <summary>
      /// Presumably the key of the related <see cref="CompanyInfo"/> - TODO confirm against the data model.
      /// </summary>
      public int CompanyInfoId { get; set; }

      #endregion Model

      #region Navigation properties

      /// <summary>Navigation property: the company this record belongs to.</summary>
      public virtual CompanyInfo CompanyInfo { get; set; }

      #endregion
  }
  /// <summary>
  /// Shareholder record scraped for a company. Derives from <see cref="SpiderModel"/>.
  /// </summary>
  public partial class LegInfo : SpiderModel
  {
      /// <summary>Creates an empty shareholder record.</summary>
      public LegInfo() { }

      #region Model

      /// <summary>Record identifier.</summary>
      public int Id { get; set; }

      /// <summary>Presumably the shareholder's licence/ID number - TODO confirm.</summary>
      public string BlicNo { get; set; }

      /// <summary>Presumably the licence/ID document type - TODO confirm.</summary>
      public string BlicType { get; set; }

      /// <summary>Presumably the remote identifier of the shareholder entry - TODO confirm.</summary>
      public string ItemId { get; set; }

      /// <summary>Presumably the shareholder (investor) name - TODO confirm.</summary>
      public string Inv { get; set; }

      /// <summary>Presumably the shareholder (investor) type - TODO confirm.</summary>
      public string InvType { get; set; }

      /// <summary>
      /// Presumably the key of the related <see cref="CompanyInfo"/> - TODO confirm against the data model.
      /// </summary>
      public int CompanyInfoId { get; set; }

      /// <summary>Creation time kept as text (format not visible here).</summary>
      public string CreateTimeStr { get; set; }

      /// <summary>Money range kept as text (meaning not visible here).</summary>
      public string MoneyRange { get; set; }

      /// <summary>NOTE(review): likely the subscribed contribution ("renjiao") - confirm.</summary>
      public string Renjiao { get; set; }

      #endregion Model

      #region Navigation properties

      /// <summary>Navigation property: the company this shareholder belongs to.</summary>
      public virtual CompanyInfo CompanyInfo { get; set; }

      #endregion
  }
复制代码

先破解验证码,获取需要查询的公司的URL,然后抓取公司详情页的HTML(过程略);关键代码有两个方法GetGsxtInfo和GetLegInfoes

如下:

  /// <summary>
  /// Requests a company detail page and extracts its basic registration data via XPath.
  /// </summary>
  /// <param name="url">Address of the company detail page.</param>
  /// <param name="html">Receives the page text returned by <c>GetHtml</c>.</param>
  /// <returns>
  /// A populated <see cref="BaseInfo"/>, or <c>null</c> when the registration-number
  /// node yields an empty string.
  /// </returns>
  public static BaseInfo GetGsxtInfo(string url, out string html)
  {
      var request = new HttpItem
      {
          URL = url, // required
          Method = "get",
          Referer = "http://www.gsxt.gov.cn/corp-query-homepage.html",
          Timeout = 10000
      };
      html = GetHtml(request);

      string companyName = GetXpathNode(html, "//h1[@class=\"fullName\"]");
      string companyNo = GetXpathNode(html, "//*[@class=\"nameBoxColor\"]");

      // No registration number means the page is not a usable detail page.
      if (companyNo == "")
      {
          return null;
      }

      return new BaseInfo
      {
          EntName = companyName,
          RegNo = companyNo,
          ApprDate = GetXpathNode(html, "//*[@class=\"companyDetail clearfix\"]/span[4]/span[1]"),
          RegState = GetXpathNode(html, "//*[@class=\"companyStatus\"]"),
          EntType = GetXpathNode(html, "//div[@class=\"overview\"]/dl[3]/dd[1]"),
          Lerep = GetXpathNode(html, "//div[@class=\"overview\"]/dl[4]/dd[1]"),
          RegCap = GetXpathNode(html, "//div[@class=\"overview\"]/dl[5]/dd[1]"),
          OpFrom = GetXpathNode(html, "//div[@class=\"overview\"]/dl[7]/dd[1]"),
          OpTo = GetXpathNode(html, "//div[@class=\"overview\"]/dl[8]/dd[1]"),
          RegOrg = GetXpathNode(html, "//*[@class=\"companyDetail clearfix\"]/span[3]/span[1]"),
          EstDate = GetXpathNode(html, "//div[@class=\"overview\"]/dl[10]/dd[1]"),
          Dom = GetXpathNode(html, "//div[@class=\"overview\"]/dl[12]/dd[1]"),
          OpScope = GetXpathNode(html, "//div[@class=\"overview\"]/dl[13]/dd")
      };
  }
  /// <summary>
  /// Downloads the shareholder list page by page and appends one <see cref="LegInfo"/>
  /// per returned record to <paramref name="reflegInfos"/>.
  /// </summary>
  /// <param name="html">
  /// Company detail page HTML; the shareholder endpoint path is read from its
  /// <c>var shareholderUrl = "..."</c> script assignment.
  /// </param>
  /// <param name="reflegInfos">List that receives the parsed records; must not be null.</param>
  /// <param name="draw">
  /// Page counter sent as the <c>draw</c> form field; requests continue while it is
  /// below the <c>totalPage</c> value of the response.
  /// </param>
  /// <param name="start">Record offset sent as the <c>start</c> form field; advanced by 5 per page.</param>
  public static void GetLegInfoes(string html, ref List<LegInfo> reflegInfos, int draw = 1, int start = 0)
  {
      string url = string.Format("http://www.gsxt.gov.cn{0}", GetFirstInnerText(html, "var shareholderUrl = \"", "\""));

      // Iterates instead of recursing once per page; the endpoint URL is the same for every page.
      while (true)
      {
          HttpItem item = new HttpItem()
          {
              URL = url, // required
              Method = "post",
              Referer = "http://www.gsxt.gov.cn/corp-query-search-1.html",
              // The second placeholder must be {1}; without it the start offset is never sent.
              Postdata = string.Format("draw={0}&start={1}&length=5", draw, start),
              ContentType = "application/x-www-form-urlencoded",
              Timeout = 10000
          };

          string rhtml = GetHtml(item);
          if (string.IsNullOrEmpty(rhtml))
          {
              // Request failed: keep whatever was collected so far.
              return;
          }

          var legInfoesListPage = JObject.Parse(rhtml);
          var legInfoesListList = legInfoesListPage["data"].ToList();

          foreach (var legInfoes in legInfoesListList)
          {
              reflegInfos.Add(new LegInfo
              {
                  BlicNo = GetText(legInfoes["bLicNo"].ToString().Replace("\"", "")),
                  BlicType = legInfoes["blicType_CN"].ToString().Replace("\"", ""),
                  ItemId = legInfoes["invId"].ToString().Replace("\"", ""),
                  Inv = legInfoes["inv"].ToString().Replace("\"", ""),
                  InvType = GetText(legInfoes["invType_CN"].ToString().Replace("\"", ""))
              });
          }

          // Stop once the last page reported by the server has been read.
          if (int.Parse(legInfoesListPage["totalPage"].ToString()) <= draw)
          {
              return;
          }

          draw++;
          start += 5;
          Console.WriteLine(string.Format("查询股东信息第{0}页", draw));
      }
  }
复制代码

到这里,代码只是为了完成任务而写的;如果想让它更加优美,就需要用IoC的模式来重构它

先创建父类:

  /// <summary>
  /// Base class for scraped entities. Subclasses override one of the <c>ToObje</c>
  /// overloads to populate themselves from a JSON token or from page HTML.
  /// Also hosts the shared text-extraction helpers.
  /// </summary>
  public class SpiderModel
  {
      // Built once and reused; the original constructed these on every call.
      private static readonly Regex ParagraphOrBreakTag =
          new Regex(@"<(p|br)[^<]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase);
      private static readonly Regex ParagraphMarker =
          new Regex("\\[p]", RegexOptions.Multiline | RegexOptions.IgnoreCase);
      private static readonly Regex BreakMarker =
          new Regex("\\[br]", RegexOptions.Multiline | RegexOptions.IgnoreCase);

      /// <summary>Creates an empty model.</summary>
      public SpiderModel()
      {
      }

      /// <summary>
      /// Creates a model and immediately populates it from <paramref name="token"/>.
      /// </summary>
      /// <param name="token">JSON token handed to <see cref="ToObje(JToken)"/>.</param>
      public SpiderModel(JToken token)
      {
          // NOTE: this is a virtual call from a constructor; an override runs before
          // the derived class's own constructor body has executed.
          ToObje(token);
      }

      /// <summary>Populates this instance from a JSON token. The base implementation does nothing.</summary>
      /// <param name="token">JSON token to read from.</param>
      public virtual void ToObje(JToken token)
      {
      }

      /// <summary>
      /// Populates a model from page HTML. The base implementation ignores
      /// <paramref name="html"/> and returns a new empty <see cref="SpiderModel"/>.
      /// </summary>
      /// <param name="html">Page HTML to read from.</param>
      /// <returns>The populated model.</returns>
      public virtual SpiderModel ToObje(string html)
      {
          return new SpiderModel();
      }

      /// <summary>
      /// Returns the cleaned text of the first node matching <paramref name="xpath"/>.
      /// </summary>
      /// <param name="html">HTML document text.</param>
      /// <param name="xpath">XPath expression selecting a single node.</param>
      /// <returns>
      /// The node's inner HTML with tabs, HTML tags and white space removed, or an empty
      /// string when no node matches. Extraction is best-effort: a failure is traced and
      /// whatever had been extracted up to that point is returned.
      /// </returns>
      public static string GetXpathNode(string html, string xpath)
      {
          string result = string.Empty;
          try
          {
              HtmlDocument htmlDoc = new HtmlDocument();
              htmlDoc.LoadHtml(html);
              HtmlNode node = htmlDoc.DocumentNode.SelectSingleNode(xpath);
              if (node != null)
              {
                  result = node.InnerHtml;
                  result = result.Replace("\t", string.Empty);
                  result = TextRemover.RemoveHTML(result); // strip HTML tags
                  result = TextRemover.RemoveWhiteSpace(result).Trim(); // strip white space
              }
          }
          catch (Exception ex)
          {
              // Still best-effort, but no longer silent: surface the failure for diagnosis.
              System.Diagnostics.Trace.TraceWarning("GetXpathNode failed for xpath '{0}': {1}", xpath, ex.Message);
          }
          return result;
      }

      /// <summary>
      /// Normalises an HTML fragment to text and passes it through
      /// <c>RegexHelper.RegexFilter</c> with the pattern <c>([a-zA-Z0-9]+)</c>.
      /// </summary>
      /// <param name="result">HTML fragment to clean.</param>
      /// <returns>The value returned by <c>RegexHelper.RegexFilter</c> for the cleaned text.</returns>
      public static string GetText(string result)
      {
          // Turn <p>/<br> tags into markers first so they survive tag removal as line breaks.
          result = ParagraphOrBreakTag.Replace(result, "[$1]");
          result = ParagraphMarker.Replace(result, "\r\n\r\n");
          result = BreakMarker.Replace(result, "\r\n");
          result = result.Replace("\t", " ");
          result = TextRemover.RemoveHTML(result); // strip remaining HTML tags
          result = result.Replace("+", "");
          result = result.Trim();
          result = RegexHelper.RegexFilter(result.Replace("\"", ""), "([a-zA-Z0-9]+)", false, RegexOptions.None);
          return result;
      }
  }
复制代码

然后实体BaseInfo(基本信息)和LegInfo(股东信息)继承自SpiderModel

然后给BaseInfo(基本信息)和LegInfo(股东信息)重写函数

BaseInfo:

  /// <summary>
  /// Fills this record from a company detail page using XPath lookups.
  /// </summary>
  /// <param name="html">Company detail page HTML.</param>
  /// <returns>
  /// This instance. Its properties are left untouched when the registration-number
  /// node yields an empty string.
  /// </returns>
  public override SpiderModel ToObje(string html)
  {
      string companyName = GetXpathNode(html, "//h1[@class=\"fullName\"]");
      string companyNo = GetXpathNode(html, "//*[@class=\"nameBoxColor\"]");

      // Without a registration number there is nothing reliable to read.
      if (companyNo == "")
      {
          return this;
      }

      EntName = companyName;
      RegNo = companyNo;

      // Header block of the page.
      ApprDate = GetXpathNode(html, "//*[@class=\"companyDetail clearfix\"]/span[4]/span[1]");
      RegState = GetXpathNode(html, "//*[@class=\"companyStatus\"]");

      // "overview" definition lists, addressed by position.
      EntType = GetXpathNode(html, "//div[@class=\"overview\"]/dl[3]/dd[1]");
      Lerep = GetXpathNode(html, "//div[@class=\"overview\"]/dl[4]/dd[1]");
      RegCap = GetXpathNode(html, "//div[@class=\"overview\"]/dl[5]/dd[1]");
      OpFrom = GetXpathNode(html, "//div[@class=\"overview\"]/dl[7]/dd[1]");
      OpTo = GetXpathNode(html, "//div[@class=\"overview\"]/dl[8]/dd[1]");
      RegOrg = GetXpathNode(html, "//*[@class=\"companyDetail clearfix\"]/span[3]/span[1]");
      EstDate = GetXpathNode(html, "//div[@class=\"overview\"]/dl[10]/dd[1]");
      Dom = GetXpathNode(html, "//div[@class=\"overview\"]/dl[12]/dd[1]");
      OpScope = GetXpathNode(html, "//div[@class=\"overview\"]/dl[13]/dd");

      return this;
  }
复制代码

LegInfo:

  /// <summary>
  /// Fills this shareholder record from one JSON entry.
  /// </summary>
  /// <param name="token">
  /// JSON entry providing the keys <c>bLicNo</c>, <c>blicType_CN</c>, <c>invId</c>,
  /// <c>inv</c> and <c>invType_CN</c>.
  /// </param>
  public override void ToObje(JToken token)
  {
      const string quote = "\"";

      // Each value has double quotes stripped; BlicNo and InvType are additionally
      // passed through GetText.
      string licenceNo = token["bLicNo"].ToString().Replace(quote, "");
      BlicNo = SpiderModel.GetText(licenceNo);

      string licenceType = token["blicType_CN"].ToString();
      BlicType = licenceType.Replace(quote, "");

      string investorId = token["invId"].ToString();
      ItemId = investorId.Replace(quote, "");

      string investor = token["inv"].ToString();
      Inv = investor.Replace(quote, "");

      string investorType = token["invType_CN"].ToString().Replace(quote, "");
      InvType = GetText(investorType);
  }
复制代码

最后要一个IOC管理类

  /// <summary>
  /// Pairs one HTTP request with the model that will parse its response, so that a
  /// list of these objects can be handed to a scheduler.
  /// </summary>
  public class SpiderManage
  {
      // Number of attempts made before giving up on a request.
      private const int MaxAttempts = 3;

      // Pause before each request, in milliseconds.
      private const int DelayBeforeRequestMs = 3000;

      // Body the site returns for an invalid or expired link.
      private const string InvalidLinkPage = "<script>window.location.href='/index/invalidLink'</script>";

      // Shared by all instances. The original locked on a freshly created local
      // object, which never blocks anything, so the lock had no effect.
      private static readonly object RequestGate = new object();

      /// <summary>Creates a manager for <paramref name="item"/> without a model.</summary>
      /// <param name="item">Request to execute.</param>
      public SpiderManage(HttpItem item)
      {
          this.Item = item;
      }

      /// <summary>Creates a manager for <paramref name="item"/> and its parsing model.</summary>
      /// <param name="item">Request to execute.</param>
      /// <param name="spiderModel">Model used by <see cref="GetOjb"/> to parse the response.</param>
      public SpiderManage(HttpItem item, SpiderModel spiderModel)
      {
          this.Item = item;
          this.SpiderModel = spiderModel;
      }

      /// <summary>Response text of the last <see cref="GetHtml"/> call; empty string after a failure.</summary>
      public string Html { get; set; }

      /// <summary>Request description.</summary>
      public HttpItem Item { get; set; }

      /// <summary>Parsed models; not populated by this class.</summary>
      public List<SpiderModel> SpiderModelList { get; set; }

      /// <summary>Model used to parse <see cref="Html"/>.</summary>
      public SpiderModel SpiderModel { get; set; }

      /// <summary>
      /// Executes <see cref="Item"/>, making up to three attempts, and stores the result in <see cref="Html"/>.
      /// Requests from all instances are serialized, each preceded by a 3-second pause.
      /// </summary>
      /// <returns>
      /// The response text of the first attempt that returns HTTP 200 with a body other
      /// than the invalid-link page; otherwise an empty string.
      /// </returns>
      public virtual string GetHtml()
      {
          for (int attempt = 0; attempt < MaxAttempts; attempt++)
          {
              HttpHelper http = new HttpHelper();
              HttpResult result;
              lock (RequestGate)
              {
                  Thread.Sleep(DelayBeforeRequestMs);
                  result = http.GetHtml(Item);
              }

              if (result.StatusCode == System.Net.HttpStatusCode.OK)
              {
                  string rhtml = result.Html;
                  if (!rhtml.Equals(InvalidLinkPage))
                  {
                      Html = result.Html;
                      return Html;
                  }
              }
          }

          Html = "";
          return Html;
      }

      /// <summary>
      /// Parses <see cref="Html"/> with <see cref="SpiderModel"/>.
      /// </summary>
      /// <returns>The model returned by <c>SpiderModel.ToObje(Html)</c>.</returns>
      public SpiderModel GetOjb()
      {
          // Throws NullReferenceException when no SpiderModel was assigned.
          return SpiderModel.ToObje(Html);
      }

      /// <summary>Placeholder; intentionally empty in the original.</summary>
      public void toList()
      {
          // SpiderModelList
          // SpiderModel
      }
  }
复制代码

最后,我只需要一个 List<SpiderManage> sManageList 用于保存对象就可以了

重新改写前面的GetGsxtInfo和GetLegInfoes函数(下面以GetLegInfoes为例)。

  /// <summary>
  /// Page requests collected by <c>GetLegInfoes</c> for later scheduling.
  /// </summary>
  public static List<SpiderManage> sManageList = new List<SpiderManage>();

  /// <summary>
  /// Fetches the first shareholder page to learn the page count, then queues one
  /// <see cref="SpiderManage"/> per page in <see cref="sManageList"/>. Only the first
  /// page is downloaded here; the remaining entries are queued unfetched.
  /// </summary>
  /// <param name="html">
  /// Company detail page HTML; the shareholder endpoint path is read from its
  /// <c>var shareholderUrl = "..."</c> script assignment.
  /// </param>
  /// <param name="draw">Page counter sent as the <c>draw</c> form field.</param>
  /// <param name="start">Record offset sent as the <c>start</c> form field; advanced by 5 per page.</param>
  public static void GetLegInfoes(string html, int draw = 1, int start = 0)
  {
      string url = string.Format("http://www.gsxt.gov.cn{0}", GetFirstInnerText(html, "var shareholderUrl = \"", "\""));

      SpiderManage sManage = new SpiderManage(CreateShareholderPageRequest(url, draw, start));
      sManage.SpiderModel = new LegInfo();
      sManage.GetHtml();

      string rhtml = sManage.Html;
      if (string.IsNullOrEmpty(rhtml))
      {
          // First page could not be fetched: nothing is queued.
          return;
      }

      var legInfoesListPage = JObject.Parse(rhtml);
      sManageList.Add(sManage);

      // The page count does not change while queueing, so parse it once.
      int totalPage = int.Parse(legInfoesListPage["totalPage"].ToString());
      while (totalPage > draw)
      {
          draw++;
          start += 5;
          sManageList.Add(new SpiderManage(CreateShareholderPageRequest(url, draw, start), new LegInfo()));
      }
  }

  /// <summary>Builds the POST request for one page of the shareholder list.</summary>
  private static HttpItem CreateShareholderPageRequest(string url, int draw, int start)
  {
      return new HttpItem()
      {
          URL = url, // required
          Method = "post",
          Referer = "http://www.gsxt.gov.cn/corp-query-search-1.html",
          // The second placeholder must be {1}; without it the start offset is never sent.
          Postdata = string.Format("draw={0}&start={1}&length=5", draw, start),
          ContentType = "application/x-www-form-urlencoded",
          Timeout = 10000
      };
  }
复制代码

后面怎么用就不讨论了,只要是把sManageList拿过去调度分配抓取就可以了




上一篇:简单工厂
下一篇:简单工厂
回复

使用道具 举报