码迷,mamicode.com
首页 > 其他好文 > 详细

爬取当当网的图书信息之封装一个工具类

时间:2016-11-27 14:07:44      阅读:202      评论:0      收藏:0      [点我收藏+]

标签:return   key   lis   des   length   商品   targe   www   阅读   

把这个类名取为Tool

封装一个下载网页的方法GetHtml

/// <summary>
/// Downloads the page at <paramref name="url"/> and returns its content as text.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <returns>
/// The downloaded text, or an empty string when the download fails for any reason
/// (including a null or malformed address).
/// </returns>
public static string GetHtml(string url)
        {
            try
            {
                // WebClient is IDisposable: release it deterministically instead of
                // leaving the clean-up to the finalizer on every call.
                using (WebClient wb = new WebClient())
                {
                    return wb.DownloadString(url);
                }
            }
            catch
            {
                // Deliberate best-effort: every failure is reported to the caller
                // as an empty page rather than as an exception.
                return "";
            }
        }

传入的是这个网页的URL,这个方法能帮我们把网页下载下来
封装一个匹配图书类URL的方法
/// <summary>
/// Collects the distinct book-category page URLs that appear in <paramref name="html"/>.
/// </summary>
/// <param name="html">Markup to scan. Null or empty input yields an empty list.</param>
/// <returns>
/// An <see cref="ArrayList"/> of URL strings in order of first appearance, without duplicates.
/// </returns>
public static ArrayList GetList(string html)
        {
            ArrayList list = new ArrayList();
            if (string.IsNullOrEmpty(html))
            {
                return list;
            }

            // The dots are escaped so that they match a literal '.' only; an unescaped
            // '.' matches any character and would accept look-alike strings.
            MatchCollection matches = Regex.Matches(html, @"http://category\.dangdang\.com/cp[0-9]{2}\.[0-9]{2}\.[0-9][1-9]\.00\.00\.00\.html");

            // The set gives constant-time duplicate detection; the list keeps first-seen order.
            HashSet<string> seen = new HashSet<string>();
            foreach (Match match in matches)
            {
                if (seen.Add(match.Value))
                {
                    list.Add(match.Value);
                }
            }
            return list;
        }
这里使用了正则http://category.dangdang.com/cp[0-9]{2}.[0-9]{2}.[0-9][1-9].00.00.00.html来匹配URL地址
封装一个获取图书类名的方法
 /// <summary>
 /// Reads the book-category name from the page's keywords meta tag.
 /// </summary>
 /// <param name="html">Markup of a category page.</param>
 /// <returns>
 /// The text between the first '/' of the matched tag and the closing quote of its
 /// content attribute, or an empty string when the tag is missing or that text is empty.
 /// </returns>
 public static string  GetBookClassName(string html)
        {
            // Example of the tag being looked for:
            //   <meta name="keywords" content="计算机/网络,家庭与办公室用书" />
            Match tag = Regex.Match(html, "<meta name=\"keywords\" content=\".{1,30}\" />");
            if (!tag.Success)
            {
                return "";
            }

            string tagText = tag.Value;
            int slashAt = tagText.IndexOf("/");
            int closeAt = tagText.LastIndexOf(">");

            // Number of characters in the tag's tail (closing quote, space, slash, bracket).
            int tailLength = "\" />".Length;

            // When the only '/' is the one in the tail there is nothing to extract.
            if (closeAt - slashAt <= tailLength)
            {
                return "";
            }

            return tagText.Substring(slashAt + 1, closeAt - slashAt - tailLength);
        }

查看网页的源代码

 <meta name="keywords" content="计算机/网络,家庭与办公室用书" />

图书类名就在这里,接着我们使用正则把它抓取到。

接下来我们要抓取每个图书类别共有多少页

技术分享

 

 /// <summary>
 /// Reads the total page count from the pager element of a category page.
 /// </summary>
 /// <param name="html">Markup of a category page.</param>
 /// <returns>The number found between "共" and "页" in the pager, or 1 when no pager is found.</returns>
 public static int GetPages(string html)
        {
            Match pager = Regex.Match(html, "<li class=\"page_input\"><span>共[0-9]{1,4}页 到第</span>");
            if (!pager.Success)
            {
                return 1;
            }

            string pagerText = pager.Value;
            int prefixAt = pagerText.IndexOf("共", 0);
            int suffixAt = pagerText.IndexOf("页", prefixAt);
            if (prefixAt <= 0 || suffixAt <= 0)
            {
                return 1;
            }

            // The digits sit directly between the two marker characters.
            int digitsStart = prefixAt + "共".Length;
            string digits = pagerText.Substring(digitsStart, suffixAt - digitsStart);
            return int.Parse(digits);
        }

处理好BookClass,接下来处理Book了

获取图书详细页面的URL

 

 /// <summary>
 /// Collects the distinct book detail-page URLs that appear in <paramref name="html"/>.
 /// </summary>
 /// <remarks>Every match, duplicates included, is also written to the console.</remarks>
 /// <param name="html">Markup to scan. Null or empty input yields an empty list.</param>
 /// <returns>
 /// An <see cref="ArrayList"/> of URL strings in order of first appearance, without duplicates.
 /// </returns>
 public static ArrayList GetProduct(string html)
        {
            // Example target: http://product.dangdang.com/22862060.html
            ArrayList list = new ArrayList();
            if (string.IsNullOrEmpty(html))
            {
                return list;
            }

            // The dots are escaped so that they match a literal '.' only; an unescaped
            // '.' matches any character and would accept look-alike strings.
            MatchCollection matches = Regex.Matches(html, @"http://product\.dangdang\.com/[0-9]{8}\.html");

            // The set gives constant-time duplicate detection; the list keeps first-seen order.
            HashSet<string> seen = new HashSet<string>();
            foreach (Match match in matches)
            {
                string url = match.Value;
                Console.WriteLine(url);
                if (seen.Add(url))
                {
                    list.Add(url);
                }
            }

            return list;
        }

 

封装一个方法,在爬虫获取到图书详细页之后,用它来抓取图书信息

 /// <summary>
 /// Extracts a book's basic information from the markup of its detail page.
 /// </summary>
 /// <param name="html">Markup of the detail page. Null is treated like an empty page.</param>
 /// <returns>
 /// A dictionary with keys 1 to 6: 1 = book name, 2 = price, 3 = author, 4 = publisher,
 /// 5 = cover image URL, 6 = description. A field that cannot be found keeps its default
 /// value: "0" for the price and an empty string for everything else.
 /// </returns>
 public static Dictionary<int, string> analysis(string html)
        {
            if (html == null)
            {
                html = "";
            }

            string bookName = "";
            string price = "0";
            string imgurl = "";

            // Price: the text that follows the yen span. The decimal point is escaped so
            // that only a real '.' separates the integer part from the two decimals.
            Match priceMatch = Regex.Match(html, @" <span class=""yen"">&yen;</span>(.{1,4}\.[0-9]{2})");
            if (priceMatch.Success)
            {
                price = priceMatch.Groups[1].Value;
            }

            // Book name: the text after "《" in the page title, up to "》" or, when the
            // title has no closing bracket, up to the fixed site suffix.
            Match titleMatch = Regex.Match(html, "<title>.*</title>");
            if (titleMatch.Success)
            {
                string title = titleMatch.Value;
                int open = title.IndexOf("《", StringComparison.Ordinal);
                if (open >= 0)
                {
                    int close = title.IndexOf("》", open, StringComparison.Ordinal);
                    if (close < 0)
                    {
                        close = title.IndexOf("【简介_书评_在线阅读】 - 当当图书", open, StringComparison.Ordinal);
                    }

                    if (close > open)
                    {
                        bookName = title.Substring(open + 1, close - open - 1);
                    }
                }
            }

            // Author and publisher are the link texts that follow their dd_name attributes.
            string author = ExtractBetween(html, "target=\"_blank\" dd_name=\"作者\">", "</a>");
            string publisher = ExtractBetween(html, "target=\"_blank\" dd_name=\"出版社\">", "</a>");

            // Cover image, e.g. http://img3x6.ddimg.cn/88/36/23845426-1_u_5.jpg
            // The dots are escaped so that they match a literal '.' only.
            Match imageMatch = Regex.Match(html, @"http://img3x[0-9]\.ddimg\.cn/[0-9]{2}/[0-9]{2}/[0-9]{8}-[0-9]_u_[0-9]\.jpg");
            if (imageMatch.Success)
            {
                imgurl = imageMatch.Value;
            }

            // Description: the content attribute of the description meta tag.
            string content = ExtractBetween(html, "<meta name=\"description\" content=\"", "\">");

            Dictionary<int, string> dictionary = new Dictionary<int, string>
            {
                { 1, bookName },
                { 2, price },
                { 3, author },
                { 4, publisher },
                { 5, imgurl },
                { 6, content }
            };

            return dictionary;
        }

 /// <summary>
 /// Returns the text between the first <paramref name="startMarker"/> and the next
 /// <paramref name="endMarker"/> after it, or an empty string when either is missing.
 /// </summary>
 private static string ExtractBetween(string text, string startMarker, string endMarker)
        {
            // Ordinal search: the markers are markup, not linguistic text.
            int markerAt = text.IndexOf(startMarker, StringComparison.Ordinal);
            if (markerAt < 0)
            {
                return "";
            }

            // Search for the end marker only after the start marker so that the two
            // can never overlap and the length below can never be negative.
            int valueStart = markerAt + startMarker.Length;
            int valueEnd = text.IndexOf(endMarker, valueStart, StringComparison.Ordinal);
            if (valueEnd < 0)
            {
                return "";
            }

            return text.Substring(valueStart, valueEnd - valueStart);
        }

 

Tool类完成

 

 

爬取当当网的图书信息之封装一个工具类

标签:return   key   lis   des   length   商品   targe   www   阅读   

原文地址:http://www.cnblogs.com/zuin/p/6106196.html

(0)
(0)
   
举报
评论 一句话评论(0
登录后才能评论!
© 2014 mamicode.com 版权所有  联系我们:gaon5@hotmail.com
迷上了代码!