﻿using System;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Data;
using System.Web;
using System.Web.UI.WebControls;

public partial class _Default : System.Web.UI.Page
{
    /// <summary>
    /// Page load handler. On the initial (non-postback) request it binds both
    /// article repeaters; postbacks are left to their own event handlers.
    /// </summary>
    /// <param name="sender">Event source.</param>
    /// <param name="e">Event arguments (unused).</param>
    protected void Page_Load(object sender, EventArgs e)
    {
        if (IsPostBack)
        {
            return;
        }

        // "Latest 10" list.
        BindData(repTop10, 10);
        // Full data list.
        BindData(repData, 1000);
    }

    /// <summary>
    /// Binds already-crawled articles to a repeater.
    /// </summary>
    /// <param name="rep">Repeater that receives the query result.</param>
    /// <param name="num">
    /// Passed as the first argument of <c>BLL.Pager.GetPager</c>; presumably the
    /// number of rows to fetch (confirm against the BLL implementation).
    /// </param>
    private void BindData(Repeater rep, int num)
    {
        // Rows come from the "Article" table ordered by "Time DESC".
        rep.DataSource = BLL.Pager.GetPager(num, 1, "", " Time DESC ", "Article", "Id,Title,Con,Source,Time,Hits");
        rep.DataBind();
    }
    //获得更有效最新技术源码看这里：http://www.51aspx.com/
    // Source names compared against ddlNewsSource.SelectedValue and stored with each article.
    private const string SourceSina = "新浪";
    private const string SourceBaiduTieba = "百度贴吧";
    private const string SourceSogou = "搜狗";

    // Upper bound on listing pages requested per click. Without it the crawl never
    // terminates when every remaining result is already stored or cannot be parsed.
    private const int MaxCrawlPages = 100;

    // Value ResolverAndOutput returns for a single column when nothing matched.
    private const string NoMatchResult = "Null$?";

    /// <summary>
    /// Crawl button handler. Validates the selected source, keyword and amount,
    /// crawls the chosen site until the requested number of new articles has been
    /// stored (or the results run out), then rebinds both article lists.
    /// </summary>
    /// <remarks>
    /// Any exception raised while crawling is reported through <c>lblShow</c>; the
    /// lists are still rebound and the button re-enabled so that articles stored
    /// before the failure are visible and the page stays usable.
    /// </remarks>
    /// <param name="sender">Event source.</param>
    /// <param name="e">Event arguments (unused).</param>
    protected void btnCrawl_Click(object sender, EventArgs e)
    {
        btnCrawl.Enabled = false;
        string news = PageOperate.GetNullToString(ddlNewsSource.SelectedValue);
        string words = PageOperate.GetNullToString(txtWords.Text.Trim());
        int nums = PageOperate.GetIntValue(ddlNums.SelectedValue);

        if (news == "")
        {
            PageOperate.AlertAndRedirect("请选择新闻来源！", "Default.aspx");
            return;
        }

        if (words == "")
        {
            PageOperate.AlertAndRedirect("请填写关键字！", "Default.aspx");
            return;
        }

        if (nums == 0)
        {
            PageOperate.AlertAndRedirect("请选择抓取的数量！", "Default.aspx");
            return;
        }

        lblShow.Text = "正在抓取.....";
        try
        {
            if (news == SourceSina || news == SourceBaiduTieba || news == SourceSogou)
            {
                // URL-encode the keyword as gb2312 so it is not garbled in the query string.
                string encodedWords = HttpUtility.UrlEncode(words, Encoding.GetEncoding("gb2312"));
                CrawlSource(news, encodedWords, nums);
            }
        }
        catch (Exception)
        {
            lblShow.Text = "抓取数据出现异常！";
            lblShow.ForeColor = System.Drawing.Color.Red;
        }

        // "Latest 10" list.
        BindData(repTop10, 10);
        // Full data list.
        BindData(repData, 1000);
        btnCrawl.Enabled = true;
    }

    /// <summary>
    /// Walks the listing pages of one source and stores new articles until
    /// <paramref name="nums"/> have been added, a page cannot be downloaded,
    /// a page contains no entries, or <c>MaxCrawlPages</c> is reached.
    /// </summary>
    /// <param name="source">One of the <c>Source*</c> constants.</param>
    /// <param name="encodedWords">URL-encoded search keyword.</param>
    /// <param name="nums">Number of new articles to store.</param>
    private void CrawlSource(string source, string encodedWords, int nums)
    {
        int crawlNum = 0;
        for (int page = 1; page <= MaxCrawlPages; page++)
        {
            string html = GetHtml(BuildListUrl(source, encodedWords, page));
            if (html == "error")
            {
                return;
            }

            string matHtml = ResolverAndOutput(html, "", "", GetListPattern(source), 1, false);
            if (matHtml == NoMatchResult)
            {
                // No entries on this page: the results are exhausted (or the markup changed).
                return;
            }

            // Entries are '$'-terminated, so the last split element is not an entry.
            string[] itemArray = matHtml.Replace("~", "").Split('$');
            for (int j = 0; j < itemArray.Length - 1; j++)
            {
                lblShow.Text = "分析到第" + page + " 页，第" + (j + 1) + "条数据，已采集" + crawlNum + "条数据！";
                // Stop as soon as the requested amount has been collected.
                if (crawlNum >= nums)
                {
                    return;
                }

                if (TryStoreArticle(source, itemArray[j]))
                {
                    crawlNum++;
                }
            }
        }
    }

    /// <summary>
    /// Processes one listing entry: extracts title and link, skips titles that are
    /// already stored, downloads the article page and saves its content.
    /// </summary>
    /// <returns><c>true</c> when a new article row was added.</returns>
    private bool TryStoreArticle(string source, string item)
    {
        // Title with HTML tags stripped.
        string title = PageOperate.CutHTML(CutString(item, GetTitleStartMarker(source), "</a>")).Trim();
        if (title == "")
        {
            return false;
        }

        // Skip titles already in the database. The title is scraped from a remote
        // page, so single quotes are doubled before it is embedded in the filter.
        // A parameterized query would be preferable if the BLL offers one.
        DataTable dt = BLL.Pager.GetPager("Id,Title", "Article", "Title = '" + title.Replace("'", "''") + "'");
        if (dt.Rows.Count > 0)
        {
            return false;
        }

        string conUrl = ExtractContentUrl(source, item);
        if (conUrl.LastIndexOf('/') < 0)
        {
            return false;
        }

        string conPage = GetHtml(conUrl);
        if (conPage == "error")
        {
            return false;
        }

        string conHtml = ExtractContent(source, conPage);
        if (conHtml == null)
        {
            return false;
        }

        return AddData(source, title, Server.HtmlEncode(conHtml), "Corp") > 0;
    }

    /// <summary>Builds the search/listing URL of a source for a 1-based page number.</summary>
    private static string BuildListUrl(string source, string encodedWords, int page)
    {
        switch (source)
        {
            case SourceSina:
                return "http://search.sina.com.cn/?q=" + encodedWords + "&range=title&c=news&sort=time&col=&source=&from=&country=&size=&time=&a=&page=" + page + "&pf=2131425478&ps=2134309112&dpc=1";
            case SourceBaiduTieba:
                // Tieba's "pn" parameter stands in for the page and advances in steps of 50.
                return "http://tieba.baidu.com/f?kw=" + encodedWords + "&pn=" + ((page - 1) * 50);
            case SourceSogou:
                return "http://news.sogou.com/news?query=" + encodedWords + "&sut=2543&sst0=1396574960819&mode=2&x=30&y=9&page=" + page + "&w=01029901&dr=1";
            default:
                throw new ArgumentException("Unsupported source: " + source, "source");
        }
    }

    /// <summary>Returns the regex that matches one entry on a source's listing page.</summary>
    private static string GetListPattern(string source)
    {
        switch (source)
        {
            case SourceSina:
                return "<h2><a href=\"http://(?<content>.+?)</span></h2>";
            case SourceBaiduTieba:
                return "<div class=\"threadlist_text threadlist_title(?<content>.+?)</a><span ></span></div>";
            case SourceSogou:
                return "<h3 class=\"pt\">(?<content>.+?)</h3>";
            default:
                throw new ArgumentException("Unsupported source: " + source, "source");
        }
    }

    /// <summary>Returns the text that immediately precedes the title inside a listing entry.</summary>
    private static string GetTitleStartMarker(string source)
    {
        return source == SourceBaiduTieba ? "class=\"j_th_tit\">" : "target=\"_blank\">";
    }

    /// <summary>
    /// Extracts the absolute article URL from a listing entry, or an empty string
    /// when the entry carries no link.
    /// </summary>
    private static string ExtractContentUrl(string source, string item)
    {
        switch (source)
        {
            case SourceSina:
                return CutString(item, "<h2><a href=\"", "\" target=\"_blank\"");
            case SourceBaiduTieba:
            {
                // Tieba links are site-relative; only prefix the host when a path was found
                // so that a missing link does not turn into a request for the home page.
                string path = CutString(item, "<a href=\"", "\" title=\"");
                return path == "" ? "" : "http://tieba.baidu.com" + path;
            }
            case SourceSogou:
                return CutString(item, "<a class=\"pp\" href=\"", "\" id=\"uigs_");
            default:
                return "";
        }
    }

    /// <summary>
    /// Extracts the article body from a downloaded article page.
    /// </summary>
    /// <returns>The raw body HTML, or <c>null</c> when no known layout matched.</returns>
    private static string ExtractContent(string source, string conPage)
    {
        switch (source)
        {
            case SourceSina:
                return ExtractSinaContent(conPage);
            case SourceBaiduTieba:
                return ExtractBaiduContent(conPage);
            case SourceSogou:
                return ExtractSogouContent(conPage);
            default:
                return null;
        }
    }

    /// <summary>Body extraction for the two article layouts handled for Sina.</summary>
    private static string ExtractSinaContent(string conPage)
    {
        const string contentStart = "<div id=\"divContent\"";
        if (HasMarker(conPage, contentStart))
        {
            return ExtractBetween(conPage, contentStart, "<div id=\"divAttachment\">");
        }

        const string containerStart = "<div class=\"blkContainerSblkCon BSHARE_POP\"";
        if (HasMarker(conPage, containerStart))
        {
            return ExtractBeforeFirstPresent(conPage, containerStart, new string[]
            {
                "<div class=\"se_edit\"",
                "<div class=\"wb_rec\" id=\"wb_rec\" style",
                "<iframe width=\"100%\" scrolling=\"no\" height=\"35\""
            });
        }

        return null;
    }

    /// <summary>Body extraction for a Baidu Tieba thread: the first post's content.</summary>
    private static string ExtractBaiduContent(string conPage)
    {
        if (!HasMarker(conPage, "<cc><div id=\"post_content_"))
        {
            return null;
        }

        string conHtml = ResolverAndOutput(conPage, "", "", "<cc><div id=\"post_content_(?<content>.+?)</div></cc>", 1, false);
        if (conHtml == NoMatchResult)
        {
            // Without this check the placeholder text "Null" would be stored as content.
            return null;
        }

        return conHtml.Replace("~", "").Split('$')[0];
    }

    /// <summary>
    /// Body extraction for Sogou news results, which link to third-party sites
    /// (Sohu, Tencent and NetEase layouts are recognised).
    /// </summary>
    private static string ExtractSogouContent(string conPage)
    {
        // Sohu news.
        const string sohuStart = "<div class=\"text clear\" id=\"contentText\"";
        if (HasMarker(conPage, sohuStart))
        {
            return ExtractBeforeFirstPresent(conPage, sohuStart, new string[]
            {
                "<div class=\"autoShare clear\">",
                "<div class=\"original-title\">"
            });
        }

        // Tencent news.
        const string tencentStart = "<div id=\"Cnt-Main-Article-QQ\"";
        if (HasMarker(conPage, tencentStart))
        {
            return ExtractBetween(conPage, tencentStart, "<span style=\"width:0;height:0;");
        }

        // NetEase news.
        const string neteaseStart = "<div id=\"endText\">";
        if (HasMarker(conPage, neteaseStart))
        {
            return ExtractBetween(conPage, neteaseStart, "<div class=\"sharecommend-wrap clearfix\">");
        }

        return null;
    }

    /// <summary>
    /// Extracts with the first end marker from <paramref name="endMarkers"/> that
    /// occurs in the page; returns <c>null</c> when none of them occurs.
    /// </summary>
    private static string ExtractBeforeFirstPresent(string conPage, string startMarker, string[] endMarkers)
    {
        foreach (string endMarker in endMarkers)
        {
            if (HasMarker(conPage, endMarker))
            {
                return ExtractBetween(conPage, startMarker, endMarker);
            }
        }
        return null;
    }

    /// <summary>
    /// Returns the page fragment that starts at <paramref name="startMarker"/> and
    /// stops just before <paramref name="endMarker"/>, or <c>null</c> when the
    /// fragment cannot be matched.
    /// </summary>
    private static string ExtractBetween(string conPage, string startMarker, string endMarker)
    {
        // The markers are placed into the pattern unescaped; none of the markers
        // used by this page contains a regex metacharacter.
        string conHtml = ResolverAndOutput(conPage, "", "", startMarker + "(?<content>.+?)" + endMarker, 1, false);
        int endIndex = conHtml.IndexOf(endMarker, StringComparison.Ordinal);
        return endIndex < 0 ? null : conHtml.Substring(0, endIndex);
    }

    /// <summary>Ordinal containment test for an HTML marker.</summary>
    private static bool HasMarker(string page, string marker)
    {
        return page.IndexOf(marker, StringComparison.Ordinal) >= 0;
    }
    //获得更有效最新技术源码看这里：http://www.51aspx.com/
    /// <summary>
    /// Stores one crawled article in the database.
    /// </summary>
    /// <param name="type">Name of the site the article came from; saved as the article source.</param>
    /// <param name="title">Article title.</param>
    /// <param name="conHtml">Article content.</param>
    /// <param name="login">Login method; not used by this method.</param>
    /// <returns>The value returned by <c>BLL.Article.Add</c>; callers treat a value above zero as success.</returns>
    private int AddData(string type, string title, string conHtml, string login)
    {
        Model.Article article = new Model.Article();
        article.Source = type;
        article.Title = PageOperate.GetNullToString(title);
        article.Con = conHtml;
        // New articles start with no hits and are stamped with the current local time.
        article.Hits = 0;
        article.Time = DateTime.Now;
        return BLL.Article.Add(article);
    }
    //获得更有效最新技术源码看这里：http://www.51aspx.com/
    /// <summary>
    /// Downloads the page at <paramref name="url"/> and returns its body decoded
    /// as gb2312.
    /// </summary>
    /// <param name="url">Absolute URL of the page.</param>
    /// <returns>
    /// The page text, or the literal string <c>"error"</c> when the request or
    /// the read fails for any reason (callers test for this sentinel).
    /// </returns>
    private static string GetHtml(string url)
    {
        try
        {
            WebRequest request = WebRequest.Create(url);
            // Dispose response, stream and reader even when reading throws, so the
            // underlying connection is released instead of leaking.
            using (WebResponse response = request.GetResponse())
            using (Stream stream = response.GetResponseStream())
            using (StreamReader reader = new StreamReader(stream, Encoding.GetEncoding("gb2312")))
            {
                return reader.ReadToEnd();
            }
        }
        catch (Exception)
        {
            // Deliberate best effort: every failure is reported through the sentinel.
            return "error";
        }
    }
    //获得更有效最新技术源码看这里：http://www.51aspx.com/
    /// <summary>
    /// Two-stage regex extraction over a downloaded page. Stage one cuts the
    /// region between <paramref name="regexStr"/> and <paramref name="regexEnd"/>;
    /// stage two collects every match of <paramref name="regexTab"/> inside it.
    /// </summary>
    /// <param name="result">Downloaded page to analyse.</param>
    /// <param name="regexStr">Regex for the start of the region (may be empty).</param>
    /// <param name="regexEnd">Regex for the end of the region (may be empty).</param>
    /// <param name="regexTab">Regex matching one column value inside the region.</param>
    /// <param name="ColNum">Number of columns per row.</param>
    /// <param name="IsRemoveHtml">Whether HTML tags are stripped from the matches.</param>
    /// <returns>
    /// The matches in the format produced by <c>GetPatternHtml</c>, or
    /// <c>"Null$"</c> repeated <paramref name="ColNum"/> times followed by
    /// <c>"?"</c> when either stage finds nothing.
    /// </returns>
    private static string ResolverAndOutput(string result, string regexStr, string regexEnd, string regexTab, int ColNum, bool IsRemoveHtml)
    {
        string columns = "error";

        // Stage one: isolate the region of interest.
        string region = GetPatternHtml(regexStr + @"([\s\S]*)" + regexEnd, result, ColNum);
        if (region != "error")
        {
            // Drop line feeds and put a space into empty <td></td> cells so that
            // the column regex can match them.
            region = region.Replace("\n", "").Replace("></td>", "> </td>");

            // Stage two: pick the column values out of the region.
            columns = GetPatternHtml(regexTab, region, ColNum);
            if (IsRemoveHtml)
            {
                columns = RemoveHtml(columns);
            }
        }

        if (columns != "error")
        {
            return columns;
        }

        // Nothing matched: hand back one "Null" placeholder per column.
        StringBuilder placeholder = new StringBuilder();
        for (int col = 0; col < ColNum; col++)
        {
            placeholder.Append("Null$");
        }
        return placeholder.Append("?").ToString();
    }
    //获得更有效最新技术源码看这里：http://www.51aspx.com/
    /// <summary>
    /// Runs a regular expression (multiline, case-insensitive) over the given
    /// HTML and joins all matches into one string.
    /// </summary>
    /// <param name="pattern">Regular expression to apply.</param>
    /// <param name="tHtml">HTML text to search.</param>
    /// <param name="Col">Number of columns per row; a <c>"~"</c> is appended after every <paramref name="Col"/>-th match.</param>
    /// <returns>
    /// Every full match followed by <c>"$"</c>, with <c>"~"</c> closing each row,
    /// or the literal <c>"error"</c> when there is no match.
    /// </returns>
    private static string GetPatternHtml(string pattern, string tHtml, int Col)
    {
        MatchCollection matches = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase).Matches(tHtml);
        if (matches.Count == 0)
        {
            return "error";
        }

        StringBuilder joined = new StringBuilder();
        for (int index = 0; index < matches.Count; index++)
        {
            joined.Append(matches[index].Groups[0].Value).Append("$");
            // Matches are counted from one, so the row closes after each Col-th match.
            if ((index + 1) % Col == 0)
            {
                joined.Append("~");
            }
        }
        return joined.ToString();
    }
    //获得更有效最新技术源码看这里：http://www.51aspx.com/
    // Patterns used by RemoveHtml, constructed once instead of on every call.
    private static readonly Regex ScriptBlockPattern = new Regex(@"(?m)<script[^>]*>(\w|\W)*?</script[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase);
    private static readonly Regex StyleBlockPattern = new Regex(@"(?m)<style[^>]*>(\w|\W)*?</style[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase);
    private static readonly Regex SelectBlockPattern = new Regex(@"(?m)<select[^>]*>(\w|\W)*?</select[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase);
    private static readonly Regex TagOrNbspPattern = new Regex("(<[^>]+?>)|&nbsp;", RegexOptions.Multiline | RegexOptions.IgnoreCase);
    private static readonly Regex WhitespaceRunPattern = new Regex("(\\s)+", RegexOptions.Multiline | RegexOptions.IgnoreCase);

    /// <summary>
    /// Strips HTML from a string: removes whole script, style and select blocks,
    /// then every remaining tag and <c>&amp;nbsp;</c> entity, and finally collapses
    /// each run of whitespace into a single space.
    /// </summary>
    /// <param name="html">HTML text; may be null or empty.</param>
    /// <returns>The text without markup; an empty string for null or empty input.</returns>
    private static string RemoveHtml(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return "";
        }

        // Blocks go first so that their inner text is removed together with the tags.
        string text = ScriptBlockPattern.Replace(html, "");
        text = StyleBlockPattern.Replace(text, "");
        text = SelectBlockPattern.Replace(text, "");
        text = TagOrNbspPattern.Replace(text, "");
        return WhitespaceRunPattern.Replace(text, " ");
    }
    //获得更有效最新技术源码看这里：http://www.51aspx.com/
    /// <summary>
    /// Returns the text found between a start marker and an end marker, with
    /// line feeds removed.
    /// </summary>
    /// <param name="pageHtml">Text to cut from.</param>
    /// <param name="starts">
    /// Start marker (surrounding whitespace is ignored). The result begins right
    /// after its first occurrence; an empty marker means "from the beginning".
    /// </param>
    /// <param name="ends">
    /// End marker (surrounding whitespace is ignored). The result stops before its
    /// first occurrence after the start marker; an empty marker means "to the end".
    /// </param>
    /// <returns>
    /// The extracted text; the whole input when both markers are empty; an empty
    /// string when the input is null or a non-empty marker is not found.
    /// </returns>
    private static string CutString(string pageHtml, string starts, string ends)
    {
        if (string.IsNullOrEmpty(pageHtml))
        {
            return "";
        }

        // Search for, and skip by, the same trimmed marker so offsets cannot drift.
        string startMarker = starts == null ? "" : starts.Trim();
        string endMarker = ends == null ? "" : ends.Trim();

        string keyText = pageHtml;
        if (startMarker != "" || endMarker != "")
        {
            int startIndex = pageHtml.IndexOf(startMarker, StringComparison.Ordinal);
            if (startIndex == -1)
            {
                return "";
            }
            // Drop everything up to and including the start marker.
            keyText = pageHtml.Substring(startIndex + startMarker.Length);

            if (endMarker != "")
            {
                int endIndex = keyText.IndexOf(endMarker, StringComparison.Ordinal);
                if (endIndex == -1)
                {
                    // A missing end marker is treated like a missing start marker
                    // instead of failing with an out-of-range index.
                    return "";
                }
                // Drop the end marker and everything after it.
                keyText = keyText.Substring(0, endIndex);
            }
        }

        return keyText.Replace("\n", "");
    }
    //获得更有效最新技术源码看这里：http://www.51aspx.com/
    /// <summary>
    /// Prepares stored article content for display in a list: HTML-decodes it,
    /// strips the HTML tags and shortens the result.
    /// </summary>
    /// <param name="con">Stored (HTML-encoded) article content.</param>
    /// <param name="len">
    /// Passed to <c>PageOperate.CutString</c>; presumably the maximum length of
    /// the returned text (confirm against that helper).
    /// </param>
    /// <returns>The shortened plain text, or <c>"&amp;nbsp;"</c> when the content is empty.</returns>
    public string Con(string con, int len)
    {
        string decoded = PageOperate.GetNullToString(Server.HtmlDecode(con));
        if (decoded == "")
        {
            return "&nbsp;";
        }

        string plainText = PageOperate.CutHTML(decoded);
        return PageOperate.CutString(plainText, len);
    }
}