⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 searchbean.java

📁 nutch搜索的改进型工具和优化爬虫的相关工具
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
			if (segmentName.equals((String) segmentNames.get(i))) {
				System.out
						.println("**********have this segment:" + segmentName);
				return true;
			}
		}
		return client.addSegment(segmentName, host, port);
	}

	/**
	 * Asks the client to delete the given segments and, if the client reports
	 * success, drops the first matching entry for each of them from the local
	 * segment-name list.
	 *
	 * @param segments names of the segments to delete
	 * @param host     forwarded unchanged to the client
	 * @param port     forwarded unchanged to the client
	 * @return {@code true} if the client call returned {@code true};
	 *         {@code false} otherwise, in which case the local list is not touched
	 * @throws IOException declared by the underlying client call
	 */
	public boolean delSegments(String[] segments, String host, String port)
			throws IOException {
		if (!client.delSegments(segments, host, port)) {
			return false;
		}
		for (int s = 0; s < segments.length; s++) {
			// Size is sampled once per segment; at most one entry is removed
			// before the scan stops, so the cached value stays valid.
			int known = this.segmentNames.size();
			int pos = 0;
			while (pos < known) {
				if (segments[s].equals((String) this.segmentNames.get(pos))) {
					System.out.println("*******delete " + segments[s]
							+ " from segmentNames");
					this.segmentNames.remove(pos);
					break;
				}
				pos++;
			}
		}
		return true;
	}

	/**
	 * Asks the client to merge {@code segments} into {@code newSegment} and,
	 * if the client reports success, updates the local segment-name list: the
	 * first matching entry of every merged segment is removed and
	 * {@code newSegment} is recorded.
	 * <p>
	 * {@code newSegment} is only appended when it is not already listed, so
	 * the list stays free of duplicates. Otherwise a later
	 * {@link #delSegments(String[], String, String)} would remove just the
	 * first copy and leave a stale name behind.
	 *
	 * @param segments   names of the segments being merged
	 * @param newSegment name of the segment that results from the merge
	 * @param host       forwarded unchanged to the client
	 * @param port       forwarded unchanged to the client
	 * @return {@code true} if the client call returned {@code true};
	 *         {@code false} otherwise, in which case the local list is not touched
	 * @throws IOException declared by the underlying client call
	 */
	public boolean mergeSegments(String[] segments, String newSegment,
			String host, String port) throws IOException {
		if (!client.mergeSegments(segments, newSegment, host, port)) {
			return false;
		}
		for (int i = 0; i < segments.length; i++) {
			int len = this.segmentNames.size();
			for (int j = 0; j < len; j++) {
				if (segments[i].equals((String) this.segmentNames.get(j))) {
					System.out.println("*******delete " + segments[i]
							+ " from segmentNames");
					this.segmentNames.remove(j);
					break;
				}
			}
		}
		// Record the merged segment once only, even if it was already known.
		if (!this.segmentNames.contains(newSegment)) {
			this.segmentNames.add(newSegment);
		}
		return true;
	}

	/**
	 * Copies the locally tracked segment names into a new array.
	 *
	 * @return a freshly allocated array holding the current segment names in
	 *         list order
	 */
	public String[] getSegmentNames() {
		String[] snapshot = new String[segmentNames.size()];
		int idx = 0;
		while (idx < snapshot.length) {
			snapshot[idx] = (String) this.segmentNames.get(idx);
			idx++;
		}
		return snapshot;
	}

	/*
	 * public boolean reloadSegmentNames(String host) throws IOException{ return
	 * client.reloadSegmentNames(host); }
	 */

	/**
	 * Forwards the query to the underlying searcher.
	 *
	 * @param query   the query to run
	 * @param numHits number of hits requested from the searcher
	 * @param mode    forwarded unchanged to the searcher
	 * @return whatever the searcher returns
	 * @throws IOException declared by the underlying searcher call
	 */
	public Hits search(Query query, int numHits, String mode) throws IOException {
		return this.searcher.search(query, numHits, mode);
	}

	/**
	 * Forwards the query and the {@code start}/{@code end} bounds to the
	 * underlying searcher, passing 0 as the sort type.
	 * <p>
	 * Modified by liubin, 2006-02-10.
	 *
	 * @param query   the query to run
	 * @param numHits number of hits requested from the searcher
	 * @param start   forwarded unchanged to the searcher
	 * @param end     forwarded unchanged to the searcher
	 * @param mode    forwarded unchanged to the searcher
	 * @return whatever the searcher returns
	 * @throws IOException declared by the underlying searcher call
	 */
	public Hits search(Query query, int numHits, long start, long end, String mode)
			throws IOException {
		return this.searcher.search(query, numHits, 0, start, end, mode);
	}

	/**
	 * Searches with an explicit sort type and a group type of 0.
	 * <p>
	 * Comment by liubin, 2006-02-10: sortType 0 = sort by relevance,
	 * 1 = sort by time. (The original note also listed searchFrom:
	 * 0 = full-text search, 1 = title search; this overload has no such
	 * parameter.)
	 *
	 * @param query    the query to run
	 * @param numHits  number of hits requested
	 * @param sortType sort order selector, see above
	 * @param mode     forwarded unchanged
	 * @return the result of the {@code (query, numHits, sortType, groupType, mode)}
	 *         overload called with {@code groupType == 0}
	 * @throws IOException declared by the delegated search
	 * @see com.netease.search.searcher.Searcher#search(net.nutch.searcher.Query,
	 *      int, int, int)
	 */
	public Hits search(Query query, int numHits, int sortType, String mode)
			throws IOException {
		return this.search(query, numHits, sortType, 0, mode);
	}
	
	/**
	 * Searches with an explicit sort type and {@code start}/{@code end}
	 * bounds, using a group type of 0.
	 *
	 * @param query    the query to run
	 * @param numHits  number of hits requested
	 * @param sortType sort order selector
	 * @param start    forwarded unchanged
	 * @param end      forwarded unchanged
	 * @param mode     forwarded unchanged
	 * @return the result of the seven-argument overload called with
	 *         {@code groupType == 0}
	 * @throws IOException declared by the delegated search
	 */
	public Hits search(Query query, int numHits, int sortType, long start, long end,
			String mode) throws IOException {
		return this.search(query, numHits, sortType, 0, start, end, mode);
	}
	
	/**
	 * Searches with explicit sort and group types, passing 0 for both the
	 * {@code start} and {@code end} bounds.
	 *
	 * @param query     the query to run
	 * @param numHits   number of hits requested
	 * @param sortType  sort order selector
	 * @param groupType grouping selector
	 * @param mode      forwarded unchanged
	 * @return the result of the seven-argument overload called with
	 *         {@code start == 0} and {@code end == 0}
	 * @throws IOException declared by the delegated search
	 */
	public Hits search(Query query, int numHits, int sortType, int groupType,
			String mode) throws IOException {
		return this.search(query, numHits, sortType, groupType, 0, 0, mode);
	}
	/**
	 * Searches for {@code query} and, when grouping is in effect, folds the
	 * raw hits so that every group number is represented by exactly one hit.
	 * <p>
	 * Grouping starts out enabled only for {@code groupType == 1}. The first
	 * query clause on the "gid" or "cid" field overrides that: a "gid" clause
	 * disables grouping, a "cid" clause enables it and forces
	 * {@code groupType} to 1. When grouping is off, the searcher result for
	 * {@code numHits} hits is returned unchanged (with {@code sortType} forced
	 * to 5 when {@code groupType == 2}).
	 * <p>
	 * When grouping is on, a larger batch of raw hits (capped at
	 * {@code MAX_RESULT_HITS}) is fetched and folded by
	 * {@code Hit.getGrpNo()}. Within a group the representative is the hit
	 * with the highest site rank, ties broken by the higher sort value; its
	 * group-document counter is incremented once for every further raw hit
	 * folded into the group. Representatives are returned in the order in
	 * which their groups were first seen.
	 *
	 * @param query     the query to run
	 * @param numHits   number of hits requested
	 * @param sortType  sort order selector, forwarded to the searcher
	 * @param groupType 0: no folding, 1: fold by gid, 2: fold by cid
	 *                  (as documented by the original author)
	 * @param start     forwarded unchanged to the searcher
	 * @param end       forwarded unchanged to the searcher
	 * @param mode      forwarded unchanged to the searcher
	 * @return the searcher's hits when grouping is off, otherwise a new
	 *         {@code Hits} holding one representative per group
	 * @throws IOException declared by the underlying searcher calls
	 */
	public Hits search(Query query, int numHits, int sortType,
					int groupType, long start, long end, String mode) throws IOException {
		boolean needGroup = (groupType == 1);

		// The first clause on "gid" or "cid" decides whether grouping applies.
		Clause[] clause = query.getClauses();
		for (int i = 0; i < clause.length; i++) {
			if (clause[i].getField().equals("gid")) {
				needGroup = false;
				break;
			}

			if (clause[i].getField().equals("cid")) {
				needGroup = true;
				groupType = 1;
				break;
			}
		}

		LOG.info("SearchBean needGroup : "+needGroup);

		// For non-negative numHits this is the smallest multiple of
		// HITS_PER_PAGE * PAGES_PER_SEARCH that is greater than numHits.
		int maxHitsRaw = HITS_PER_PAGE * PAGES_PER_SEARCH
			* (numHits / HITS_PER_PAGE / PAGES_PER_SEARCH + 1);
		int numHitsRaw = (int) (maxHitsRaw * RAW_HITS_FACTOR);

		// Added by LiuBin, 2006-03-22: never request more than MAX_RESULT_HITS.
		if (numHitsRaw > MAX_RESULT_HITS) {
			numHitsRaw = MAX_RESULT_HITS;
		}

		LOG.info("searching for " + numHitsRaw + " raw hits");
		// Added by Xie Shuqiang, 2006/09/19.
		if (groupType == 2)
			sortType = 5;

		if (!needGroup) {
			return searcher.search(query, numHits, sortType, start, end, mode);
		}

		// Grouping requested: fetch the enlarged raw batch.
		Hits hits = searcher.search(query, numHitsRaw, sortType, start, end, mode);
		// NOTE(review): needGroup is only ever true together with
		// groupType == 1 (see the assignments above), so this return cannot
		// currently be reached. Kept as-is; confirm the intended behaviour
		// for groupType == 2.
		if (groupType == 2) {
			return hits;
		}

		long total = hits.getTotal();
		Map grpToHits = new HashMap();   // group key -> current representative hit
		List resultList = new ArrayList(); // group keys in first-seen order
		for (int rawHitNum = 0; rawHitNum < total; rawHitNum++) {
			if (rawHitNum == hits.getLength()) {
				// Ran out of fetched hits before reaching the reported total.
				// Retry once with the maximum batch size and start folding
				// over from scratch; give up if we were already at the cap.
				if (numHitsRaw >= MAX_RESULT_HITS)
					break;
				numHitsRaw = MAX_RESULT_HITS;
				LOG.info("re-searching for " + numHitsRaw + " raw hits");
				hits = searcher.search(query, numHitsRaw, sortType, start, end, mode);
				LOG.info("found " + hits.getTotal() + " raw hits");
				rawHitNum = -1; // the loop increment brings this back to 0
				grpToHits.clear();
				resultList.clear();
				total = hits.getTotal();
				continue;
			}

			Hit hit = hits.getHit(rawHitNum);
			int grpNo = 0;
			if (groupType == 1)
				grpNo = hit.getGrpNo();

			// One boxed key per hit, shared by the map and the order list.
			Integer grpKey = new Integer(grpNo);
			Hit grpHit = (Hit) grpToHits.get(grpKey);
			if (grpHit == null) {
				// First hit of this group.
				grpToHits.put(grpKey, hit);
				resultList.add(grpKey);
				if (resultList.size() > maxHitsRaw)
					break;
			} else if (hit.getSiteRank() > grpHit.getSiteRank() ||
					(hit.getSiteRank() == grpHit.getSiteRank() && hit.getSortInt() > grpHit.getSortInt())) {
				// The new hit outranks the representative: it takes over and
				// inherits the group's document count plus one.
				hit.setGrpDocs(grpHit.getGrpDocs() + 1);
				grpToHits.put(grpKey, hit);
			} else {
				grpHit.setGrpDocs(grpHit.getGrpDocs() + 1);
			}
		}

		// De-duplicated output: one representative per group, in first-seen
		// order. Indexed access avoids the quadratic cost of repeatedly
		// removing the head of an ArrayList.
		int resultListSize = resultList.size();
		Hit[] resultHits = new Hit[resultListSize];
		for (int i = 0; i < resultListSize; i++) {
			resultHits[i] = (Hit) grpToHits.get(resultList.get(i));
		}

		// If total < maxHitsRaw, report the folded count as the total.
		if (total < maxHitsRaw) {
			total = resultListSize;
		}

		return new Hits(total, resultListSize, resultHits);
	}
	/*******************
	private class ClusterComparator implements Comparator{
		public int compare(Object o1, Object o2){
			Hit h1 = (Hit)o1;
			Hit h2 = (Hit)o2;
			
			if (h1.getGrpDocs() > 1 && h2.getGrpDocs() == 1)
				return -1;
			if (h1.getGrpDocs() == 1 && h2.getGrpDocs() > 1)
				return 1;
			
			if (h1.getSiteRankSum() < h2.getSiteRankSum())
				return -1;
			if (h1.getSiteRankSum() > h2.getSiteRankSum())
				return 1;
			if (h1.getGrpDocs() < h2.getGrpDocs())
				return -1;
			if (h1.getGrpDocs() > h2.getGrpDocs())
				return 1;
			if (h1.getGrpNo() < h2.getGrpNo())
				return -1;
			else
				return 1;
		}
	}
	****************************/
	private class SiteHits extends ArrayList {
		private boolean maxSizeExceeded;
	}

	/**
	 * Search for pages matching a query, eliminating excessive hits from sites.
	 * Hits for a site in excess of <code>maxHitsPerSite</code> are removed
	 * from the results. The remaining hits for such sites have {@link
	 * Hit#moreFromSiteExcluded()} set.
	 * <p>
	 * If maxHitsPerSite is zero then all hits are returned.
	 * 
	 * @param query
	 *            query
	 * @param numHits
	 *            number of requested hits
	 * @param maxHitsPerSite
	 *            the maximum hits returned per site, or zero
	 * @return Hits the matching hits
	 * @throws IOException
	 */
	/*
	 * public Hits search(Query query, int numHits, int maxHitsPerSite) throws
	 * IOException { if (maxHitsPerSite <= 0) // disable site checking return
	 * searcher.search(query, numHits);
	 * 
	 * int numHitsRaw = (int)(numHits * RAW_HITS_FACTOR); LOG.info("searching
	 * for "+numHitsRaw+" raw hits"); Hits hits = searcher.search(query,
	 * numHitsRaw); long total = hits.getTotal(); Map siteToHits = new
	 * HashMap(); List resultList = new ArrayList(); Set seen = new HashSet();
	 * List excludedSites = new ArrayList(); boolean totalIsExact = true; for
	 * (int rawHitNum = 0; rawHitNum < hits.getTotal(); rawHitNum++) { // get
	 * the next raw hit if (rawHitNum >= hits.getLength()) { // optimize query
	 * by prohibiting more matches on some excluded sites Query optQuery =
	 * (Query)query.clone(); for (int i = 0; i < excludedSites.size(); i++) { if
	 * (i == MAX_PROHIBITED_TERMS) break;
	 * optQuery.addProhibitedTerm(((String)excludedSites.get(i)), "site"); }
	 * numHitsRaw = (int)(numHitsRaw * RAW_HITS_FACTOR); LOG.info("re-searching
	 * for "+numHitsRaw+" raw hits, query: "+optQuery); hits =
	 * searcher.search(optQuery, numHitsRaw); LOG.info("found
	 * "+hits.getTotal()+" raw hits"); rawHitNum = 0; continue; }
	 * 
	 * Hit hit = hits.getHit(rawHitNum); if (seen.contains(hit)) continue;
	 * seen.add(hit);
	 *  // get site hits for its site String site = hit.getSite(); SiteHits
	 * siteHits = (SiteHits)siteToHits.get(site); if (siteHits == null)
	 * siteToHits.put(site, siteHits = new SiteHits());
	 *  // does this hit exceed maxHitsPerSite? if (siteHits.size() ==
	 * maxHitsPerSite) { // yes -- ignore the hit if (!siteHits.maxSizeExceeded) {
	 *  // mark prior hits with moreFromSiteExcluded for (int i = 0; i <
	 * siteHits.size(); i++) {
	 * ((Hit)siteHits.get(i)).setMoreFromSiteExcluded(true); }
	 * siteHits.maxSizeExceeded = true;
	 * 
	 * excludedSites.add(site); // exclude site } totalIsExact = false; } else { //
	 * no -- collect the hit resultList.add(hit); siteHits.add(hit);
	 *  // are we done? // we need to find one more than asked for, so that we
	 * can tell if // there are more hits to be shown if (resultList.size() >
	 * numHits) break; } }
	 * 
	 * Hits results = new Hits(total, (Hit[])resultList.toArray(new
	 * Hit[resultList.size()])); results.setTotalIsExact(totalIsExact); return
	 * results; }
	 * 
	 */
	/**
	 * Forwards to the underlying searcher's {@code getExplanation}.
	 *
	 * @param query the query the hit was found for
	 * @param hit   the hit to explain
	 * @return whatever the searcher returns
	 * @throws IOException declared by the underlying searcher call
	 */
	public String getExplanation(Query query, Hit hit) throws IOException {
		return this.searcher.getExplanation(query, hit);
	}

	/**
	 * Forwards a single hit to the underlying detailer.
	 *
	 * @param hit the hit to look up
	 * @return whatever the detailer returns for {@code hit}
	 * @throws IOException declared by the underlying detailer call
	 */
	public HitDetails getDetails(Hit hit) throws IOException {
		return this.detailer.getDetails(hit);
	}

	/**
	 * Forwards an array of hits to the underlying detailer.
	 *
	 * @param hits the hits to look up
	 * @return whatever the detailer returns for {@code hits}
	 * @throws IOException declared by the underlying detailer call
	 */
	public HitDetails[] getDetails(Hit[] hits) throws IOException {
		return this.detailer.getDetails(hits);
	}
	
	/**
	 * Shorthand for {@code getSummaryNew(hit, query, 1)}.
	 *
	 * @param hit   details of the hit to summarise
	 * @param query the query the hit was found for
	 * @return the result of {@code getSummaryNew} with summary type 1
	 * @throws IOException declared by the delegated call
	 */
	public String getSummary(HitDetails hit, Query query) throws IOException {
		return this.getSummaryNew(hit, query, 1);
	}

	/**
	 * Forwards to the underlying summarizer's {@code getSummary}.
	 *
	 * @param hit             details of the hit to summarise
	 * @param query           the query the hit was found for
	 * @param fullTextSummary forwarded unchanged to the summarizer
	 * @return whatever the summarizer returns
	 * @throws IOException declared by the underlying summarizer call
	 */
	public String getSummary(HitDetails hit, Query query, boolean fullTextSummary)
			throws IOException {
		return this.summarizer.getSummary(hit, query, fullTextSummary);
	}
	/**
	 * Forwards to the underlying summarizer's {@code getSummaryNew}.
	 *
	 * @param hit         details of the hit to summarise
	 * @param query       the query the hit was found for
	 * @param summaryType forwarded unchanged to the summarizer
	 * @return whatever the summarizer returns
	 * @throws IOException declared by the underlying summarizer call
	 */
	public String getSummaryNew(HitDetails hit, Query query, int summaryType)
			throws IOException {
		return this.summarizer.getSummaryNew(hit, query, summaryType);
	}
	/**
	 * Shorthand for {@code getSummaryNew(hits, query, 1)}.
	 *
	 * @param hits  details of the hits to summarise
	 * @param query the query the hits were found for
	 * @return the result of {@code getSummaryNew} with summary type 1
	 * @throws IOException declared by the delegated call
	 */
	public String[] getSummary(HitDetails[] hits, Query query) throws IOException {
		return this.getSummaryNew(hits, query, 1);
	}
	/**
	 * Forwards an array of hit details to the underlying summarizer's
	 * {@code getSummary}.
	 *
	 * @param hits            details of the hits to summarise
	 * @param query           the query the hits were found for
	 * @param fullTextSummary forwarded unchanged to the summarizer
	 * @return whatever the summarizer returns
	 * @throws IOException declared by the underlying summarizer call
	 */
	public String[] getSummary(HitDetails[] hits, Query query, boolean fullTextSummary)
			throws IOException {
		return this.summarizer.getSummary(hits, query, fullTextSummary);
	}
	
	/**
	 * Forwards an array of hit details to the underlying summarizer's
	 * {@code getSummaryNew}.
	 *
	 * @param hits        details of the hits to summarise
	 * @param query       the query the hits were found for
	 * @param summaryType forwarded unchanged to the summarizer
	 * @return whatever the summarizer returns
	 * @throws IOException declared by the underlying summarizer call
	 */
	public String[] getSummaryNew(HitDetails[] hits, Query query, int summaryType)
			throws IOException {
		return this.summarizer.getSummaryNew(hits, query, summaryType);
	}

	/**
	 * Forwards to the underlying content source's {@code getContent}.
	 *
	 * @param hit details of the hit whose content is wanted
	 * @return whatever the content source returns
	 * @throws IOException declared by the underlying call
	 */
	public byte[] getContent(HitDetails hit) throws IOException {
		return this.content.getContent(hit);
	}

	/**
	 * Forwards to the underlying content source's {@code getParseData}.
	 *
	 * @param hit details of the hit whose parse data is wanted
	 * @return whatever the content source returns
	 * @throws IOException declared by the underlying call
	 */
	public ParseData getParseData(HitDetails hit) throws IOException {
		return this.content.getParseData(hit);
	}

	/**
	 * Forwards to the underlying content source's {@code getParseText}.
	 *
	 * @param hit details of the hit whose parse text is wanted
	 * @return whatever the content source returns
	 * @throws IOException declared by the underlying call
	 */
	public ParseText getParseText(HitDetails hit) throws IOException {
		return this.content.getParseText(hit);
	}

	/**
	 * Forwards to the underlying content source's {@code getAnchors}.
	 *
	 * @param hit details of the hit whose anchors are wanted
	 * @return whatever the content source returns
	 * @throws IOException declared by the underlying call
	 */
	public String[] getAnchors(HitDetails hit) throws IOException {
		return this.content.getAnchors(hit);
	}

	/**
	 * Forwards to the underlying content source's {@code getFetchDate}.
	 *
	 * @param hit details of the hit whose fetch date is wanted
	 * @return whatever the content source returns
	 * @throws IOException declared by the underlying call
	 */
	public long getFetchDate(HitDetails hit) throws IOException {
		return this.content.getFetchDate(hit);
	}

	/**
	 * For debugging: parses the first command-line argument as a query, runs
	 * it in {@code DistributedSearch.MODE_RAM}, prints the total hit count and
	 * then the details of at most the first ten hits.
	 *
	 * @param args {@code args[0]} is the query string; with no arguments the
	 *             usage line is printed and the JVM exits with status -1
	 * @throws Exception anything thrown while building the bean, parsing the
	 *                   query, or searching
	 */
	public static void main(String[] args) throws Exception {

		String usage = "SearchBean query";

		if (args.length == 0) {
			System.err.println(usage);
			System.exit(-1);
		}

		SearchBean bean = new SearchBean(DistributedSearch.MODE_RAM);
		Query query = Query.parse(args[0]);

		Hits hits = bean.search(query, 10,0,0,0, DistributedSearch.MODE_RAM);
		System.out.println("Total hits: " + hits.getTotal());
		// Never ask for more hits than were actually returned, and show ten at most.
		int length = (int) Math.min(Math.min(hits.getTotal(), hits.getLength()), 10);
		Hit[] show = hits.getHits(0, length);
		HitDetails[] details = bean.getDetails(show);
		// Summaries are still fetched but are currently not printed.
		String[] summaries = bean.getSummaryNew(details, query, 1);

		// Iterate over what was fetched: hits.getLength() may exceed the
		// number of detail entries (e.g. when the grouped path returns more
		// than ten hits), which used to overrun the details array.
		for (int i = 0; i < details.length; i++) {
			System.out.println(" " + i + " " + details[i]);// + "\n" +
															// summaries[i]);
		}
	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -