#!/usr/local/bin/python --
## -*-python-*-
##****************************************************************************
##
## File:          webcserver.py
## RCS:           $Header: $
## Description:   DL WebCrawler Service LSP
## Author:        Scott Hassan, Stanford University
## Created:
## Modified:
## Language:      python
## Package:       N/A
## Status:        Experimental (Do Not Distribute)
##
## (c) Copyright 1995, Stanford University, all rights reserved.
##
##*****************************************************************************
##
##
##

import posix, os, socket, sys
import time, string, regex
import ilu
import urllib
import ICosProperty
import IDLInterchange, IDLInterchange__skel
import shelve

## Prefer dlcoslib; when it cannot be imported, dllib is bound to the same
## name instead.  `dllib` stays None whenever dlcoslib is in use, and main()
## tests it to decide how the service is registered.
try:
    dllib = None
    import dlcoslib
except ImportError:
    import dllib
    dlcoslib = dllib

## Property names this service can answer.  The ORDER matters: PropNames()
## returns one flag per entry and PropValues() pairs values with those flags
## by position.
props = [IDLInterchange.kDocTitle,
         IDLInterchange.kDocAuthor,
         IDLInterchange.kDocAbstract,
         IDLInterchange.kDocDate,
         IDLInterchange.kItemWeight,
         IDLInterchange.kItemContentType,
         IDLInterchange.kItemContent,
         IDLInterchange.kItemContentLength,
         "url"]


def PropNames(pPropertyNames):
    ## Split the requested property names against `props`.
    ##
    ## Returns (pn, names):
    ##   pn    -- one truth value per entry of `props`, true when that
    ##            property was requested
    ##   names -- the requested properties, in `props` order
    names = []
    pn = []
    for prop in props:
        v = prop in pPropertyNames
        pn.append(v)
        if v:
            names.append(prop)
    return pn, names


def PropValues(pn, title, weight, ref):
    ## Build the list of property values for one hit.
    ##
    ## `pn` is the flag list produced by PropNames(); only flagged
    ## properties are emitted, each wrapped with dlcoslib.OutputAny().
    ## `raw` is positionally aligned with `props`.
    raw = [title,         # kDocTitle
           "",            # kDocAuthor        (not available)
           "",            # kDocAbstract      (not available)
           "",            # kDocDate          (not available)
           weight,        # kItemWeight
           "text/html",   # kItemContentType
           "",            # kItemContent      (not fetched)
           0,             # kItemContentLength
           ref]           # "url"
    anItemValues = []
    for i in range(len(raw)):
        if pn[i]:
            anItemValues.append(dlcoslib.OutputAny(raw[i]))
    return anItemValues


##
class WebCrawlerSearch:
    ## Runs one WebCrawler query and parses the result page lazily.
    ## Fetched pages are cached in the "webcrawler.shelve" shelf keyed by
    ## the full query URL.

    def __init__(self):
        self.page = None        # raw text of the result page
        self.lines = None       # self.page split on newlines
        self.results = None     # parsed (title, ref, weight, line) tuples
        self.weightpat = None
        self.refpat = None
        self.titlepat = None
        self.index = None       # next entry of self.lines to examine
        # NOTE(review): the shelf is opened per search object and never
        # explicitly closed in this file.
        self.shelve = shelve.open("webcrawler.shelve")

    def Search(self, pPhrase, pMaxHits):
        #
        # Fetch the result page for pPhrase (from the cache when the same
        # URL was requested before) and reset the parsing state.
        #
        qphrase = urllib.quote(pPhrase)
        pURL = ("http://webcrawler.com/cgi-bin/WebQuery?maxHits="
                + repr(pMaxHits) + "&" + "searchText=" + qphrase)

        if self.shelve.has_key(pURL):
            ## print "cache hit"
            self.page = self.shelve[pURL]
        else:
            ## print "load url"
            self.page = urllib.urlopen(pURL).read()
            self.shelve[pURL] = self.page

        self.lines = string.splitfields(self.page, "\n")
        self.index = 0

        # create a new result container
        self.results = []

        # set up the patterns.
        self.weightpat = regex.compile('\([0-9]+\)[ ]')
        self.refpat = regex.compile('href="\([^"]+\)"')
        self.titlepat = regex.compile('">\([^<]+\)')

    def Hits(self):
        #
        # Return the number of documents the page says were returned;
        # 0 when the page reports no documents or carries no summary line.
        #
        nodocs_pattern = 'found no documents!'
        if regex.search(nodocs_pattern, self.page) >= 0:
            return 0

        pat = 'found \([0-9]+\) documents and returned \([0-9]+\)'
        rpat = regex.compile(pat)
        rhits = 0
        if rpat.search(self.page) >= 0:
            # group 1 is the total found, group 2 the number returned;
            # only the latter is reported.
            rhits = string.atoi(rpat.group(2))
        return rhits

    def GetItem(self, pWhich):
        #
        # Return the pWhich-th parsed hit as (title, ref, weight, line),
        # or None when the page has no such hit.
        #
        # Lines are parsed on demand and cached in self.results.  Parsing
        # continues until the requested index is cached, so callers that
        # ask for items out of order still get the item they asked for
        # (the earlier code returned whichever line was parsed next).
        #
        while len(self.results) <= pWhich and self.index < len(self.lines):
            line = self.lines[self.index]
            self.index = self.index + 1
            # NOTE(review): the row-detection pattern is the empty string,
            # which matches every line.  It looks as if the intended
            # pattern was lost -- confirm against the page format.
            if regex.search("", line) >= 0:
                title = ""
                ref = ""
                weight = 0
                if self.weightpat.search(line) >= 0:
                    weight = string.atoi(self.weightpat.group(1))
                if self.refpat.search(line) >= 0:
                    ref = self.refpat.group(1)
                if self.titlepat.search(line) >= 0:
                    title = self.titlepat.group(1)
                self.results.append((title, ref, weight, line))

        if len(self.results) > pWhich:
            return self.results[pWhich]
        return None


##
##
class WebCrawlerResultCollection(IDLInterchange__skel.CCollection,
                                 dlcoslib.IPropertySetImpl):
    ## Collection object handed back to clients; serves items out of a
    ## WebCrawlerSearch.  Server cookies are item indices wrapped with
    ## dlcoslib.OutputAny().

    def __init__(self, pService, pModule, pSearch):
        dlcoslib.IPropertySetImpl.__init__(self)
        self.module = pModule
        self.fService = pService
        self.fSearch = pSearch
        self.fTotalItems = None

    ## ------------------------------------------------------------
    def GetTotalItems(self):
        dlcoslib.LogMessage(self.module, "GetTotalItems()", 1)
        return self.fTotalItems

    ## ------------------------------------------------------------
    def SetTotalItems(self, pTotalItems):
        dlcoslib.LogMessage(self.module,
                            "SetTotalItems(" + repr(pTotalItems) + ")", 1)
        self.fTotalItems = pTotalItems

    ## ------------------------------------------------------------
    def GetItems(self, pPropertyNames, pCookie, pNumberOfItems):
        ## Return up to pNumberOfItems items starting at the index carried
        ## by pCookie (None means start at 0).
        ##
        ## Returns (anItemsACState, pMoreCookie): the item values with their
        ## access capabilities, and a one-element list whose "aCookie" is
        ## None when the end of the result set has been reached.
        dlcoslib.LogMessage(self.module,
                            "GetItems(" + repr(pPropertyNames) + ", "
                            + repr(pCookie) + ", "
                            + repr(pNumberOfItems) + ")", 1)

        if pCookie is None:
            aStart = 0
        else:
            aStart = dlcoslib.InputAny(pCookie)

        aEnd = pNumberOfItems + aStart
        if aEnd > self.fTotalItems:
            aEnd = self.fTotalItems

        anItemsValues = []
        anAccessCapabilities = []
        pn, names = PropNames(pPropertyNames)

        anItemsState = {}
        anItemsState["aNames"] = names
        anItemsState["anItemsState"] = anItemsValues

        anItemsACState = {}
        anItemsACState["aACs"] = anAccessCapabilities
        anItemsACState["aState"] = anItemsState

        pMoreCookie = []

        for i in range(aStart, aEnd):
            item = self.fSearch.GetItem(i)
            if item:
                title, ref, weight, line = item
                anItemValues = PropValues(pn, title, weight, ref)
                anItemACs = []
                anAccessCapabilities.append(anItemACs)
                anItemsValues.append(anItemValues)
                # One capability pointing back at this collection, one
                # carrying the document URL itself.
                anItemACs.append({"aTarget" : self,
                                  "aCookie" : dlcoslib.OutputAny(i),
                                  "anItemClass" : IDLInterchange.kDocument,
                                  "aHints" : []})
                anItemACs.append({"aTarget" : None,
                                  "aCookie" : dlcoslib.OutputAny(ref),
                                  "anItemClass" : "www",
                                  "aHints" : []})

        if aEnd == self.fTotalItems:
            aCookie = None
        else:
            aCookie = dlcoslib.OutputAny(aEnd)

        pMoreCookie.append({"aTarget" : self,
                            "aCookie" : aCookie,
                            "anItemClass" : "",
                            "aHints" : []})

        ## print anItems
        ## print anItemsACState
        return anItemsACState, pMoreCookie

    ## ------------------------------------------------------------
    def RequestItems(self, pMessageID, pPropertyNames, pCookie,
                     pNumberOfItems, pResultTarget):
        ## Callback form of GetItems(): results are delivered to
        ## pResultTarget, followed by CompletedRequest.
        anItems, pMoreCookie = self.GetItems(pPropertyNames, pCookie,
                                             pNumberOfItems)
        pResultTarget.AddItems(pMessageID, anItems, pMoreCookie)
        pResultTarget.CompletedRequest(pMessageID)
        return

    ## ------------------------------------------------------------
    def GetItemsProperties(self, pPropertyNames, pServerCookies):
        ## Return the requested properties for the items named by
        ## pServerCookies.  Cookies with no matching item are skipped.
        dlcoslib.LogMessage(self.module,
                            "GetItemsProperties(" + repr(pPropertyNames)
                            + ", " + repr(pServerCookies) + ")", 1)

        anItemsValues = []
        pn, names = PropNames(pPropertyNames)

        anItemsState = {}
        anItemsState["aNames"] = names
        anItemsState["anItemsState"] = anItemsValues

        for cookie in pServerCookies:
            itemno = dlcoslib.InputAny(cookie)
            item = self.fSearch.GetItem(itemno)
            if item:
                title, ref, weight, line = item
                anItemValues = PropValues(pn, title, weight, ref)
                anItemsValues.append(anItemValues)

        return anItemsState

    def RequestItemsProperties(self, pMessageID, pPropertyNames,
                               pServerCookies, pClientCookies,
                               pResultTarget):
        ## Callback form of GetItemsProperties().  For every server cookie
        ## that resolves to an item, the item's values and the matching
        ## client cookie are delivered to pResultTarget.
        anItemsValues = []
        pn, names = PropNames(pPropertyNames)

        anItemsState = {}
        anItemsState["aNames"] = names
        anItemsState["anItemsState"] = anItemsValues

        anItemsCookieState = {}
        aClientCookies = []
        anItemsCookieState["aClientCookies"] = aClientCookies
        anItemsCookieState["aState"] = anItemsState

        anIndex = 0
        for cookie in pServerCookies:
            itemno = dlcoslib.InputAny(cookie)
            item = self.fSearch.GetItem(itemno)
            if item:
                title, ref, weight, line = item
                # Append the computed values themselves.  The earlier code
                # appended a fresh empty list and then rebound the local
                # name, so clients always received empty rows.
                anItemsValues.append(PropValues(pn, title, weight, ref))
                # NOTE(review): nesting reconstructed -- the client cookie
                # is recorded only for resolved items so it stays aligned
                # with anItemsValues; confirm.
                aClientCookies.append(pClientCookies[anIndex])
            anIndex = anIndex + 1

        pResultTarget.SetItemsProperties(pMessageID, anItemsCookieState)
        pResultTarget.CompletedRequest(pMessageID)
        return


##
## **************************************************************************
##
class WebCrawler(IDLInterchange__skel.CConstrainCollection,
                 dlcoslib.IPropertySetImpl):
    ## The search service object: turns a query summary into a
    ## WebCrawlerResultCollection.

    def __init__(self, pModule):
        # NOTE(review): unlike WebCrawlerResultCollection, this does not
        # call dlcoslib.IPropertySetImpl.__init__ -- confirm whether that
        # is intentional.
        self.module = pModule

    def _RunSearch(self, pQuerySummary, pServicePrefSummary):
        ## Run the query and return (search object, number of hits).
        ## At least 100 hits are always requested from WebCrawler, however
        ## few the client asked for.
        aSearch = WebCrawlerSearch()
        n = pServicePrefSummary["aNumberOfItems"]
        if n < 100:
            n = 100
        aSearch.Search(string.join(pQuerySummary["aQueryComponents"]), n)
        return aSearch, aSearch.Hits()

    def RequestConstrain(self, pMessageID, pQuerySummary,
                         pServicePrefSummary, pResultTarget):
        ## Callback form of Constrain(): the total and the first batch of
        ## items are delivered to pResultTarget.
        dlcoslib.LogMessage(self.module,
                            "RequestConstrain(" + repr(pMessageID) + ', '
                            + repr(pQuerySummary) + ', '
                            + repr(pServicePrefSummary) + ")", 1)

        aSearch, pTotalSize = self._RunSearch(pQuerySummary,
                                              pServicePrefSummary)
        pResultTarget.SetTotalItems(pTotalSize)

        if pTotalSize == 0:
            # NOTE(review): no CompletedRequest is sent on this path --
            # confirm the client does not wait for one.
            dlcoslib.LogMessage(self.module, "Result set = 0", 1)
            return

        aResult = WebCrawlerResultCollection(self, self.module, aSearch)
        aResult.SetTotalItems(pTotalSize)
        aResult.RequestItems(pMessageID,
                             pQuerySummary["aItemPropertyNames"],
                             None,
                             pServicePrefSummary["aNumberOfItems"],
                             pResultTarget)
        return

    ## ------------------------------------------------------------
    def Constrain(self, pQuerySummary, pServicePrefSummary):
        ## Run the query synchronously.
        ##
        ## Returns (anItems, pTotalSize, pMoreCookie), or (None, 0, None)
        ## when the search produced no hits.
        dlcoslib.LogMessage(self.module,
                            "Constrain(" + repr(pQuerySummary) + ', '
                            + repr(pServicePrefSummary) + ")", 1)

        aSearch, pTotalSize = self._RunSearch(pQuerySummary,
                                              pServicePrefSummary)
        if pTotalSize == 0:
            return None, 0, None

        aResult = WebCrawlerResultCollection(self, self.module, aSearch)
        aResult.SetTotalItems(pTotalSize)
        anItems, pMoreCookie = aResult.GetItems(
            pQuerySummary["aItemPropertyNames"],
            None,
            pServicePrefSummary["aNumberOfItems"])

        ## print pTotalSize
        ## print pMoreCookie
        return anItems, pTotalSize, pMoreCookie


##
## Main
##
def main(argc, argv):
    ## Create the ILU server, register the WebCrawler service and enter
    ## the main loop.  argc and argv are accepted but not used here.
    if not dllib:
        server = ilu.CreateServer(None, None, None)
    else:
        server = ilu.CreateServer("Service",
                                  "tcp_localhost_3000",
                                  "sunrpc_2_0x31000400_4272011225")

    service_name = "WebCrawler"

    if not dllib:
        args = dlcoslib.ParseStandardArgs()
        if args.has_key("name"):
            n = args["name"]
            service_name = n[0]
        # NOTE(review): nesting reconstructed -- the "Services/" prefix is
        # assumed to apply only on the dlcoslib path; confirm.
        service_name = "Services/" + service_name

    ## set up the search service
    service = WebCrawler(service_name)

    if not dllib:
        dlcoslib.Publish(service_name, service)
    else:
        service.IluInstHandle = 1
        # NOTE(review): nesting reconstructed -- EnableFactoryClient is
        # assumed to belong to the dllib path only; confirm.
        dlcoslib.EnableFactoryClient(service)

    dlcoslib.RunMainLoop()


if __name__ == '__main__':
    main(len(sys.argv), sys.argv)