/*
 * 采集并保存链接
 */
package mall.kgmall.spider.bean;
import dao.MallScSpiderlinkconfig;
import service.entryService.MallScSpiderlinkconfigEntryService;
import dao.MallScSpiderurls;
import java.lang.Thread;


import dao.MallScSpiderjob;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;


import mall.kgmall.spider.bean.ArticleUrlsBean;
import mall.kgmall.spider.bean.HttpFileBean;
import mall.kgmall.spider.web.Html;

import service.entryService.MallScSpiderurlsEntryService;
import service.entryService.MallScSpiderjobEntryService;
import dao.MallScSpidersites;
import service.entryService.MallScSpidersitesEntryService;


import mall.security.portal.core.MallSecurityUserUtil;
import mall.security.portal.core.MallUserRoseConfig;
import net.juluu.app.security.core.PortalUserLoginConfig;
import net.juluu.app.security.core.SecurityActionCheckUtil;
import net.juuluu.spider.core.SpiderPath;
import net.juuluu.spider.core.web.ArticleHtmlProcessor;
/**
 * Worker thread that collects article links for a single spider job and stores them.
 *
 * <p>For the job passed to the constructor the thread builds the list-page URLs
 * from the job's link configuration, downloads every list page, extracts the
 * links matching the configured keyword, persists the ones that are not stored
 * yet and finally hands the stored links of the job to the shared
 * {@link ArticleUrlsBean}.
 *
 * <p>The (misspelled) class name is kept as-is because other code refers to it.
 */
public class ArtileUrlSpider extends Thread {
	/** Placeholder in the configured list-page URL that is replaced by the page number. */
	private String listPagePathReplaceSTR="{*}";
	private HttpFileBean httpFileBean;
	private int hostPort =80;
	/*
	 * Data meant to be used by the thread. Only one thread is used at the moment;
	 * synchronization has to be added before several threads share this list.
	 * NOTE(review): nothing in this class fills the list; it is only reachable
	 * through its getter and setter.
	 */
	private ArrayList<String> listPageLinksURLList;
	private MallScSpiderjob scSpiderjob;
	/*
	 * Status flags are written by this thread and exposed through public getters
	 * (presumably polled by the thread that started the spider), hence volatile.
	 */
	private volatile boolean isworking;
	private volatile boolean spiderend=false;
	private SpiderPath spiderPath=new SpiderPath();
	/** Accumulated, human-readable crawl log; printed to stdout when run() finishes. */
	private StringBuffer log=new StringBuffer();

	/** Link configuration of the job; may be null when the lookup finds nothing. */
	private MallScSpiderlinkconfig linkconfig;
	private ArticleUrlsBean articleUrlsBean;
	private Html html;
	private ArticleHtmlProcessor articleHtmlProcessor;

	/**
	 * Creates a spider for the given job.
	 *
	 * <p>The link configuration is looked up by the job id through
	 * {@link MallScSpiderlinkconfigEntryService}.
	 *
	 * @param job the spider job whose links are collected; must not be null
	 * @param articleUrlsBean bean that receives the stored links of the job
	 */
	public ArtileUrlSpider(MallScSpiderjob job,ArticleUrlsBean articleUrlsBean){
		this.linkconfig=MallScSpiderlinkconfigEntryService
				.getInstance()
				.getLinkConfigByJobId(job.getJobId());
		this.html=new Html();
		this.articleHtmlProcessor=ArticleHtmlProcessor.getInstance();
		this.articleUrlsBean=articleUrlsBean;
		this.httpFileBean=new HttpFileBean();
		this.scSpiderjob=job;
	}

	/**
	 * Thread entry point: runs {@link #ScSpiderurls()} and prints the crawl log.
	 */
	@Override
	public void run(){
		isworking=true;
		try{
			ScSpiderurls();
		}finally{
			// Print the log and clear the flag even when crawling throws, so the
			// failure is diagnosable and isIsworking() does not stay true forever.
			System.out.println(getLog().toString());
			isworking=false;
		}
	}

	/**
	 * Collects the links of every list page of the job and stores the new ones.
	 *
	 * <p>List pages whose content cannot be downloaded are logged and skipped.
	 * The {@code spiderend} flag is set when the method returns, whether
	 * normally or through an exception.
	 */
	public void ScSpiderurls(){
		try{
			ArrayList<String> listPageURL=listPageLinksURLCreate();
			if(listPageURL==null){
				log.append("获取链接失败！请确认相关链接采集配置！");
				return;
			}
			log.append("\n共有："+listPageURL.size() +"个列表页");

			for(String urlfullPath:listPageURL){
				log.append("\n列表页采集："+urlfullPath);
				// Download the list page.
				StringBuffer pageContent=listPageConentCreate(urlfullPath);
				if(pageContent==null){
					log.append("\n无法获取列表内容 路径："+urlfullPath);
					// Nothing to parse for this page; carry on with the next one
					// instead of failing on the missing content.
					continue;
				}

				// Extract the product links from the page.
				ArrayList<String> parsedLinks=ArticleUrlParse(pageContent);

				// Turn the raw links into URL entities of this job.
				ArrayList<MallScSpiderurls> urlEntities=linkUrlsCreate(parsedLinks);

				// Persist the links that are not stored yet.
				saveLinkURL(urlEntities);

				// Hand the stored links of the job to the bean of the running crawl.
				List<MallScSpiderurls> linkListSaved=queryUrlListByJobId();
				saveLinkUrlToBean(linkListSaved);
			}
		}finally{
			this.spiderend=true;
		}
	}

	/**
	 * Passes the given stored links to the {@link ArticleUrlsBean}.
	 *
	 * @param ScSpiderurlsList links to add to the bean
	 */
	public synchronized void saveLinkUrlToBean(List<MallScSpiderurls> ScSpiderurlsList){
		getArticleUrlsBean().addUrlListByParseWebFile(ScSpiderurlsList);
	}

	/**
	 * Builds the URLs of the list pages to crawl from the link configuration.
	 *
	 * <p>In multi-page mode the placeholder returned by
	 * {@link #getListPagePathReplaceSTR()} is replaced by every page number from
	 * the configured start id to the configured end id (inclusive). When the
	 * multi-page flag is false or unset, the configured URL is used as the
	 * only page.
	 *
	 * @return the list-page URLs, or null when the link configuration or its
	 *         page URL is missing
	 */
	public ArrayList<String> listPageLinksURLCreate(){
		log.append("\n获取列表页 listPageLinksURLCreate()");
		MallScSpiderlinkconfig lc=getLinkconfig();
		if(lc==null){
			// The configuration lookup may find nothing; the caller reports a null
			// result as a configuration problem.
			log.append("\n未找到链接采集配置。");
			return null;
		}

		String url=lc.getSpiderPageUrl();
		if(url==null){
			log.append("链接页地址为空，无法采集，请设置链接页地址。");
			return null;
		}

		ArrayList<String> linkpageList=new ArrayList<String>();
		Boolean morePage=lc.getIsMorePage();
		if(morePage==null){
			log.append("未设置是否多链接页，按单页处理。");
			linkpageList.add(url);
		}else if(morePage.booleanValue()){
			String placeholder=getListPagePathReplaceSTR();
			int startpageId=lc.getListPageStartId();
			int count=lc.getListPageEndId()-startpageId+1;
			for(int i=0;i<count;i++){
				String pageURL=url.replace(placeholder, Integer.toString(startpageId+i));
				linkpageList.add(pageURL);
			}
		}else{
			// Single-page crawl.
			linkpageList.add(url);
		}
		return linkpageList;
	}

	/**
	 * Downloads the content of one list page.
	 *
	 * <p>The site of the job supplies the host, the link configuration supplies
	 * the file encoding and {@link #getHostPort()} supplies the port.
	 *
	 * @param urlfullPath URL path of the list page
	 * @return the page content as returned by
	 *         {@code HttpFileBean.getWebFileByFullPath()}, or null when the site
	 *         of the job cannot be found
	 */
	public StringBuffer listPageConentCreate(String urlfullPath){
		log.append("\nlistPageConentCreate()");
		HttpFileBean listPageBean =getHttpFileBean();
		MallScSpiderjob job=getScSpiderjob();
		log.append("\njobId"+job.getJobId()+"site:"+job.getSiteId());
		MallScSpidersites site=(MallScSpidersites)MallScSpidersitesEntryService.getInstance()
				.get(job.getSiteId());
		if(site==null){
			// Without the site there is no host to connect to.
			log.append("\n未找到采集网站 siteId:"+job.getSiteId());
			return null;
		}

		SpiderPath p=getSpiderPath();
		p.setUrlHost(site.getSiteUrl());
		p.setFileEncode(getLinkconfig().getSourceEncode());
		p.setUrlPath(urlfullPath);
		p.setUrlPort(getHostPort());
		p.setEncodeSource("utf-8");
		listPageBean.setSpiderPath(p);

		return listPageBean.getWebFileByFullPath();
	}

	/**
	 * Extracts the product links from the content of a list page.
	 *
	 * <p>The content is lower-cased before it is handed to the {@link Html}
	 * helper, which returns the links matching the configured keyword.
	 *
	 * @param fileContent content of the list page; null is treated as "no links"
	 * @return the links found, never null
	 */
	public ArrayList<String> ArticleUrlParse(StringBuffer fileContent){
		log.append("\nArticleUrlParse()");
		if(fileContent==null){
			return new ArrayList<String>();
		}
		Html parser=getHtml();
		// Locale.ROOT keeps the lower-casing independent of the JVM default locale
		// (e.g. Turkish would map 'I' to a dotless 'ı').
		parser.setHtmlContent(fileContent.toString().toLowerCase(java.util.Locale.ROOT));

		String keyword=getLinkconfig().getKeyWord();
		ArrayList<String> links=parser.getLinkByNamePrefix(keyword);
		if(links==null){
			links=new ArrayList<String>();
		}

		if(links.isEmpty()){
			log.append("\n无法采集列表页链接，请重新设置采集规则。ArtileUrlSpider.ArticleUrlParse（）\n分析日志："+parser.getLog().toString());
		}else{
			log.append("\n采集产品链接："+links.size()+"个");
		}
		return links;
	}

	/**
	 * Creates one URL entity of the current job per link.
	 *
	 * <p>Both the URL path and the full path of an entity are set to the link.
	 *
	 * @param links raw links; null is treated as empty
	 * @return the URL entities, in the order of the links
	 */
	public ArrayList<MallScSpiderurls> linkUrlsCreate(ArrayList<String> links){
		log.append("\nlinkUrlsCreate()");
		ArrayList<MallScSpiderurls> urlList=new ArrayList<MallScSpiderurls>();
		if(links==null){
			return urlList;
		}
		for(String urlpath:links){
			MallScSpiderurls url=new MallScSpiderurls();
			url.setUrlPath(urlpath);
			url.setJobId(getScSpiderjob().getJobId());
			url.setFullPath(urlpath);
			urlList.add(url);
		}
		return urlList;
	}

	/**
	 * Persists the product links that are not yet stored for the current job.
	 *
	 * <p>A link counts as stored when a lookup by job id and URL path finds it.
	 *
	 * @param urls URL entities to store; null is ignored
	 */
	public void saveLinkURL(ArrayList<MallScSpiderurls> urls){
		if(urls==null){
			return;
		}
		MallScSpiderurlsEntryService bean=MallScSpiderurlsEntryService.getInstance();
		for(MallScSpiderurls url:urls){
			MallScSpiderurls urlSaved=bean.queryByUrlJobId(getScSpiderjob().getJobId(), url.getUrlPath());
			if(urlSaved==null){
				System.out.println("保存链接："+url.getUrlPath());
				bean.merge(url);
			}else{
				System.out.println("不保存，已存在链接"+url.getUrlPath());
			}
		}
	}

	/**
	 * Loads the stored URL entities of the current job.
	 *
	 * @return the result of {@code getListByParentId} for the job id
	 */
	public List<MallScSpiderurls> queryUrlListByJobId(){
		MallScSpiderurlsEntryService bean=MallScSpiderurlsEntryService.getInstance();
		return bean.getListByParentId(getScSpiderjob().getJobId());
	}

	/**
	 * Placeholder check that currently always returns true.
	 *
	 * <p>The original note says that only jobs and sites bound to automatic
	 * collection should be crawled; that filtering is not implemented here.
	 *
	 * @return always true
	 */
	public boolean checkIsNew(){
		return true;
	}

	public void setLinkconfig(MallScSpiderlinkconfig linkconfig) {
		this.linkconfig = linkconfig;
	}

	public MallScSpiderlinkconfig getLinkconfig() {
		return linkconfig;
	}

	public void setArticleUrlsBean(ArticleUrlsBean articleUrlsBean) {
		this.articleUrlsBean = articleUrlsBean;
	}

	public ArticleUrlsBean getArticleUrlsBean() {
		return articleUrlsBean;
	}

	public void setHtml(Html html) {
		this.html = html;
	}

	public Html getHtml() {
		return html;
	}

	public void setArticleHtmlProcessor(ArticleHtmlProcessor articleHtmlProcessor) {
		this.articleHtmlProcessor = articleHtmlProcessor;
	}

	public ArticleHtmlProcessor getArticleHtmlProcessor() {
		return articleHtmlProcessor;
	}

	public void setHttpFileBean(HttpFileBean httpFileBean) {
		this.httpFileBean = httpFileBean;
	}

	public HttpFileBean getHttpFileBean() {
		return httpFileBean;
	}

	public void setListPageLinksURLList(ArrayList<String> listPageLinksURLList) {
		this.listPageLinksURLList = listPageLinksURLList;
	}

	public ArrayList<String> getListPageLinksURLList() {
		return listPageLinksURLList;
	}

	public void setScSpiderjob(MallScSpiderjob scSpiderjob) {
		this.scSpiderjob = scSpiderjob;
	}

	public MallScSpiderjob getScSpiderjob() {
		return scSpiderjob;
	}

	public void setListPagePathReplaceSTR(String listPagePathReplaceSTR) {
		this.listPagePathReplaceSTR = listPagePathReplaceSTR;
	}

	public String getListPagePathReplaceSTR() {
		return listPagePathReplaceSTR;
	}

	public void setHostPort(int hostPort) {
		this.hostPort = hostPort;
	}

	public int getHostPort() {
		return hostPort;
	}

	public void setIsworking(boolean isworking) {
		this.isworking = isworking;
	}

	public boolean isIsworking() {
		return isworking;
	}

	public void setLog(StringBuffer log) {
		this.log = log;
	}

	public StringBuffer getLog() {
		return log;
	}

	public void setSpiderPath(SpiderPath spiderPath) {
		this.spiderPath = spiderPath;
	}

	public SpiderPath getSpiderPath() {
		return spiderPath;
	}

	public void setSpiderend(boolean spiderend) {
		this.spiderend = spiderend;
	}

	public boolean isSpiderend() {
		return spiderend;
	}

	/**
	 * Manual smoke test: loads the job with the hard-coded id 3 and crawls its
	 * links on a new thread.
	 *
	 * @param argv unused
	 */
	public static void main(String argv[]){
		// NOTE(review): presumably initialises the DAO layer before the services
		// are used -- confirm against util.DaoService.
		util.DaoService.getIntrance();
		MallScSpiderjob job=(MallScSpiderjob)MallScSpiderjobEntryService.getInstance().get(3);
		ArticleUrlsBean ab=new ArticleUrlsBean();
		ArtileUrlSpider b=new ArtileUrlSpider(job,ab);
		b.start();
	}
}
