After Ray Camden posted his first-draft spider code the other week, I remembered that I have spider code that I wrote last year but haven't published yet, so here goes. My component lets you set a maximum number of links to crawl, and lets you create filters for inclusion and exclusion of specific file paths or file names. It also:
- checks the response code of each URL (with a lightweight HEAD request) before fetching its full content;
- avoids calling the same URL more than once;
- and returns a query containing the URL, title, and contents of each page called, perfect for creating a Verity (or Solr) search index.
<cfcomponent output="false">
	<!--- spider.cfc written by Tom Mollerus (tmollerus@pingidentity.com) --->
	<!---
		Simple recursive site spider. crawl() fetches a start page, follows every
		link that begins with the start URL, and returns a query holding the
		relative url, title and tag-stripped body of each page it indexed.

		State lives in the VARIABLES scope of the instance. qData and qLinks are
		created once, when the component is instantiated, and crawl() does not
		clear them, so repeated crawl() calls on the same instance accumulate.
	--->

	<!--- Initialize parameters --->
	<cfset VARIABLES.maxLinks = "0" />
	<cfset VARIABLES.extensions = "" />
	<cfset VARIABLES.excludeFilters = "" />
	<!--- Pages that were indexed (the value returned by crawl) --->
	<cflock name="qDataLock" timeout="2">
		<cfset VARIABLES.qData = QueryNew('url,title,body,itemDate', 'varchar,varchar,varchar,date') />
	</cflock>
	<!--- Every URL that has been fetched, used to avoid fetching it twice --->
	<cflock name="qLinkLock" timeout="2">
		<cfset VARIABLES.qLinks = QueryNew('url', 'varchar') />
	</cflock>

	<!---
		Crawl a site starting at the given URL.
		site           - absolute start URL; also the prefix every followed link must begin with
		extensions     - comma-delimited list of file extensions to index (empty = all)
		excludeFilters - pipe-delimited list of URL filters to leave out of the index;
		                 a leading and/or trailing * acts as a wildcard
		maxLinks       - maximum number of URLs to fetch (0 = no limit)
		Returns the qData query (url, title, body, itemDate). Nothing is crawled when
		site is not a valid URL or the HEAD request yields no status code.
	--->
	<cffunction name="crawl" access="public" returntype="query" output="false">
		<cfargument name="site" default="" />
		<cfargument name="extensions" default="" />
		<cfargument name="excludeFilters" default="" />
		<cfargument name="maxLinks" default="0" />

		<cfif IsValid('URL', ARGUMENTS.site) and GetStatus(ARGUMENTS.site)>
			<cfset VARIABLES.maxLinks = Val(ARGUMENTS.maxLinks) />
			<cfset VARIABLES.excludeFilters = ARGUMENTS.excludeFilters />
			<cfset VARIABLES.extensions = ARGUMENTS.extensions />
			<!--- Check the links; checkLinks reads the extension list from VARIABLES --->
			<cfset checkLinks(ARGUMENTS.site, ARGUMENTS.site) />
		</cfif>

		<cfreturn VARIABLES.qData />
	</cffunction>

	<!---
		Issue a HEAD request and return the numeric HTTP status code.
		Returns 0 when the request throws or the status text has no leading number.
	--->
	<cffunction name="getStatus" access="public" returntype="numeric" output="false">
		<cfargument name="link" required="true" />
		<cfset var result = 0 />
		<!--- Function-local result so recursive callers cannot clobber each other's response --->
		<cfset var httpResult = '' />

		<cftry>
			<cfhttp method="head" url="#ARGUMENTS.link#" redirect="true" timeout="5" result="httpResult"></cfhttp>
			<cfset result = Val(httpResult.statusCode) />
			<!--- Best effort: an unreachable link is reported as status 0 --->
			<cfcatch type="any"></cfcatch>
		</cftry>

		<cfreturn result />
	</cffunction>

	<!---
		Decide whether a link should be fetched.
		Returns false when the link was already fetched, is a javascript: link,
		the maxLinks limit has been reached, or the link does not start with domain.
	--->
	<cffunction name="shouldFollow" access="public" returntype="boolean" output="false">
		<cfargument name="link" required="true" />
		<cfargument name="domain" required="true" />
		<cfset var result = true />
		<cfset var qHasBeenChecked = '' />

		<cflock name="qLinkLock" timeout="2">
			<cfquery name="qHasBeenChecked" dbtype="query">
				SELECT url
				FROM VARIABLES.qLinks
				WHERE url = <cfqueryparam value="#ARGUMENTS.link#" cfsqltype="cf_sql_varchar" />
			</cfquery>
		</cflock>

		<cfif qHasBeenChecked.recordCount>
			<cfset result = false />
		<cfelseif ARGUMENTS.link contains 'javascript:'>
			<cfset result = false />
		<cfelseif Val(VARIABLES.maxLinks) and VARIABLES.qLinks.recordCount gte Val(VARIABLES.maxLinks)>
			<cfset result = false />
		<cfelseif Left(ARGUMENTS.link, Len(ARGUMENTS.domain)) neq ARGUMENTS.domain>
			<cfset result = false />
		</cfif>

		<cfreturn result />
	</cffunction>

	<!---
		Decide whether a fetched page belongs in the result query.
		When an extension list is set, the text after the last "." of the URL
		(query string removed) must be in it. Otherwise the page is rejected if
		it matches any exclude filter:
		  *text*  link contains text
		  text*   link starts with text
		  *text   link ends with text
		  text    link equals text
	--->
	<cffunction name="shouldIndex" access="public" returntype="boolean" output="false">
		<cfargument name="link" required="true" />
		<cfset var result = true />
		<cfset var filter = '' />
		<cfset var literalFilter = '' />

		<cfif ListLen(VARIABLES.extensions) and not ListFindNoCase(VARIABLES.extensions, ListLast(ListFirst(ARGUMENTS.link, '?'), '.'))>
			<cfset result = false />
		<cfelseif ListLen(VARIABLES.excludeFilters, '|')>
			<cfloop index="filter" list="#VARIABLES.excludeFilters#" delimiters="|">
				<cfset literalFilter = Replace(filter, '*', '', 'ALL') />
				<cfif Left(filter, 1) eq '*' and Right(filter, 1) eq '*'>
					<cfif ARGUMENTS.link contains literalFilter>
						<cfset result = false />
					</cfif>
				<cfelseif Right(filter, 1) eq '*'>
					<cfif Left(ARGUMENTS.link, Len(literalFilter)) eq literalFilter>
						<cfset result = false />
					</cfif>
				<cfelseif Left(filter, 1) eq '*'>
					<cfif Right(ARGUMENTS.link, Len(literalFilter)) eq literalFilter>
						<cfset result = false />
					</cfif>
				<cfelse>
					<cfif ARGUMENTS.link eq filter>
						<cfset result = false />
					</cfif>
				</cfif>
				<!--- One matching filter is enough; the result cannot change back to true --->
				<cfif not result>
					<cfbreak />
				</cfif>
			</cfloop>
		</cfif>

		<cfreturn result />
	</cffunction>

	<!---
		Fetch a page, record it as visited, index it if it qualifies, then
		recurse into every link found in it that shouldFollow() accepts and
		that answers a HEAD request with status 200.
		page   - absolute URL to fetch
		domain - URL prefix that followed links must start with
	--->
	<cffunction name="checkLinks" access="public" returntype="void" output="false">
		<cfargument name="page" required="true" />
		<cfargument name="domain" required="true" />
		<!--- All working variables are function-local because this function recurses --->
		<cfset var link = '' />
		<cfset var aLinks = ArrayNew(1) />
		<cfset var linkStatus = 0 />
		<cfset var httpResult = '' />

		<!--- Get the page --->
		<cfhttp method="get" url="#ARGUMENTS.page#" redirect="true" resolveurl="true" timeout="10" result="httpResult"></cfhttp>

		<!--- Record the URL as visited, whatever its status --->
		<cflock name="qLinkLock" timeout="2">
			<cfset QueryAddRow(VARIABLES.qLinks) />
			<cfset QuerySetCell(VARIABLES.qLinks, 'url', ARGUMENTS.page) />
		</cflock>

		<cfif Val(httpResult.statusCode) eq 200>
			<cfif shouldIndex(ARGUMENTS.page)>
				<cflock name="qDataLock" timeout="2">
					<cfset QueryAddRow(VARIABLES.qData) />
					<!--- Path of the page without scheme and host --->
					<cfset QuerySetCell(VARIABLES.qData, 'url', getRelativePath(ARGUMENTS.page)) />
					<!--- Text of the title element --->
					<cfset QuerySetCell(VARIABLES.qData, 'title', getPageTitle(httpResult.fileContent)) />
					<!--- Page text with comments, scripts, styles and tags removed --->
					<cfset QuerySetCell(VARIABLES.qData, 'body', getBrowsableContent(httpResult.fileContent)) />
					<!--- No date is available from the page itself --->
					<cfset QuerySetCell(VARIABLES.qData, 'itemDate', '') />
				</cflock>
			</cfif>

			<!--- Parse out the links in the page (the pound sign must be doubled inside a CFML string) --->
			<cfset aLinks = ReMatchNoCase('((((https?:|ftp:)\/\/)|(www\.|ftp\.))[-[:alnum:]\?$%,\.\/\|##!@:=\+~_]+[A-Za-z0-9\/])', StripComments(httpResult.fileContent)) />

			<!--- For each of the links --->
			<cfloop index="link" array="#aLinks#">
				<!--- Strip the link of any bookmark, and drop an explicit default http port.
				      The port is only removed when it directly follows the host, so a
				      port such as 8080 is left untouched. --->
				<cfset link = ReReplaceNoCase(ListFirst(link, '##'), '^(http://[^/:]+):80([/?]|$)', '\1\2') />

				<!--- If the link hasn't been checked already --->
				<cfif shouldFollow(link, ARGUMENTS.domain)>
					<cfset linkStatus = GetStatus(link) />
					<!--- If the link is up, check its contents as well --->
					<cfif linkStatus eq 200>
						<cfset checkLinks(link, ARGUMENTS.domain) />
					</cfif>
				</cfif>
			</cfloop>
		</cfif>

		<cfreturn />
	</cffunction>

	<!---
		Reduce an HTML document to its text: HTML comments, script blocks,
		style blocks and all remaining tags are removed.
	--->
	<cffunction name="getBrowsableContent" access="public" returntype="string" output="false">
		<cfargument name="string" required="true" />

		<cfset ARGUMENTS.string = StripComments(ARGUMENTS.string) />
		<cfset ARGUMENTS.string = ReReplaceNoCase(ARGUMENTS.string, '<script.*?>.*?</script>', '', 'ALL') />
		<cfset ARGUMENTS.string = ReReplaceNoCase(ARGUMENTS.string, '<style.*?>.*?</style>', '', 'ALL') />
		<cfset ARGUMENTS.string = ReReplace(ARGUMENTS.string, '<[^>]*>', '', 'ALL') />

		<cfreturn ARGUMENTS.string />
	</cffunction>

	<!---
		Remove every HTML comment from the string. The match is non-greedy so
		that text between two separate comments is kept.
	--->
	<cffunction name="stripComments" access="public" returntype="string" output="false">
		<cfargument name="string" required="true" />

		<cfset ARGUMENTS.string = ReReplace(ARGUMENTS.string, '<!--.*?-->', '', 'ALL') />

		<cfreturn ARGUMENTS.string />
	</cffunction>

	<!---
		Return the trimmed text of the first title element (matched without
		regard to case), or an empty string when the document has no title.
	--->
	<cffunction name="getPageTitle" access="public" returntype="string" output="false">
		<cfargument name="string" required="true" />
		<cfset var match = REFindNoCase('<title[^>]*>([^<]*)</title>', ARGUMENTS.string, 1, true) />

		<!--- Element 2 of pos/len describes the captured title text; Mid() rejects a zero length --->
		<cfif ArrayLen(match.pos) gte 2 and match.pos[2] gt 0 and match.len[2] gt 0>
			<cfreturn Trim(Mid(ARGUMENTS.string, match.pos[2], match.len[2])) />
		</cfif>

		<cfreturn '' />
	</cffunction>

	<!---
		Strip the scheme (any, not only http) and the host from a URL and
		return what remains, e.g. https://example.com/a/b.cfm gives /a/b.cfm.
		A value that already starts with a slash is returned unchanged.
	--->
	<cffunction name="getRelativePath" access="public" returntype="string" output="false">
		<cfargument name="path" required="true" />

		<cfreturn ReReplaceNoCase(ARGUMENTS.path, '^([a-z][a-z0-9+.-]*://)?[^/]*', '', 'ONE') />
	</cffunction>
</cfcomponent>

Comments (2)
February 20, 2011
10:23PM | #
Most fav SW parody: "The Empire Strikes Bike" . See my url.
June 9, 2011
12:05PM | #
I may be in need of this. Thanks