« Not just images: using generated content in CSS | Main | Two developer positions open in Wellesley, MA »

Spider.cfc for ColdFusion

After Ray Camden posted his first-draft spider code the other week, I remembered that I have spider code that I wrote last year but haven't published yet, so here goes. My component lets you set a maximum number of links to crawl, and lets you create filters for inclusion and exclusion of specific file paths or file names. It also:

  • checks the response code of any URL before calling the full content;
  • avoids calling the same URL more than once;
  • and returns a query containing the url, title, and contents of each page called, perfect for creating a Verity (or Solr) search index.
If anyone has comments, let me know what you think.

<cfcomponent output="false">
	<!--- spider.cfc written by Tom Mollerus (tmollerus@pingidentity.com) --->
	
	<!--- Component-wide crawl settings; crawl() replaces them once it accepts a start URL --->
	<cfset VARIABLES.extensions = "" />
	<cfset VARIABLES.excludeFilters = "" />
	<cfset VARIABLES.maxLinks = "0" />
	<!--- Result query returned by crawl(): one row per indexed page --->
	<cflock timeout="2" name="qDataLock">
		<cfset VARIABLES.qData = QueryNew('url,title,body,itemDate', 'varchar,varchar,varchar,date') />
	</cflock>
	<!--- Every URL fetched so far; shouldFollow() consults it to avoid fetching a URL twice --->
	<cflock timeout="2" name="qLinkLock">
		<cfset VARIABLES.qLinks = QueryNew('url', 'varchar') />
	</cflock>
	
	<!---
		Entry point: crawls a site from a start URL and returns everything that was indexed.
		site: absolute start URL; it is also used as the prefix a link must begin with to be followed.
		extensions: comma-delimited list of file extensions to index (empty means no extension filter).
		excludeFilters: pipe-delimited list of URL filters kept out of the index; a leading and/or trailing * is a wildcard.
		maxLinks: maximum number of URLs to fetch (0 means no limit).
		Returns VARIABLES.qData (url, title, body, itemDate). Nothing here clears that query, so
		repeated calls on the same instance keep adding to it.
	--->
	<cffunction name="crawl">
		<cfargument name="site" default="" />
		<cfargument name="extensions" default="" />
		<cfargument name="excludeFilters" default="" />
		<cfargument name="maxLinks" default="0" />
		<!--- Only start when the URL is well formed and the HEAD probe returned a non-zero status --->
		<cfif IsValid('URL', ARGUMENTS.site) and GetStatus(ARGUMENTS.site)>
			<!--- Store the settings where shouldFollow() and shouldIndex() read them --->
			<cfset VARIABLES.maxLinks = Val(ARGUMENTS.maxLinks) />
			<cfset VARIABLES.excludeFilters = ARGUMENTS.excludeFilters />
			<cfset VARIABLES.extensions = ARGUMENTS.extensions />
			<!--- checkLinks() declares only (page, domain); the start URL serves as both --->
			<cfset checkLinks(ARGUMENTS.site, ARGUMENTS.site) />
		</cfif>
		
		<cfreturn VARIABLES.qData />
	</cffunction>
	
	<!---
		Sends a HEAD request (following redirects, 5 second timeout) and returns the numeric status code.
		link: URL to probe.
		Returns 0 when the request throws or when the status text does not start with a number.
	--->
	<cffunction name="getStatus">
		<cfargument name="link" required="true" />
		<cfset var result = 0 />
		<!--- Function-local response holder, so the probe never overwrites a cfhttp variable shared with other code --->
		<cfset var headResponse = '' />
		
		<cftry>
			<cfhttp method="head" url="#ARGUMENTS.link#" redirect="true" timeout="5" result="headResponse"></cfhttp>
			<!--- Val() keeps only the leading digits of the status text --->
			<cfset result = Val(headResponse.statusCode) />
			<cfcatch type="any">
				<!--- Best effort on purpose: a probe that fails is reported as status 0, not as an error --->
				<cfset result = 0 />
			</cfcatch>
		</cftry>
		
		<cfreturn result />
	</cffunction>
	
	<!---
		Decides whether the crawler should fetch a link.
		link: candidate URL.
		domain: prefix the link must start with to count as part of the crawled site.
		Returns false for javascript: links, once the maxLinks limit is reached, for links that do
		not start with the domain prefix, and for URLs already recorded in VARIABLES.qLinks;
		true otherwise.
	--->
	<cffunction name="shouldFollow">
		<cfargument name="link" required="true" />
		<cfargument name="domain" required="true" />
		<cfset var result = true />
		<!--- Declared locally so the lookup query does not leak into the component's VARIABLES scope --->
		<cfset var qHasBeenChecked = '' />
		
		<!--- Cheap string and counter checks first; the query of queries only runs when they all pass --->
		<cfif ARGUMENTS.link contains 'javascript:'>
			<cfset result = false />
		<cfelseif Val(VARIABLES.maxLinks) and VARIABLES.qLinks.recordCount gte Val(VARIABLES.maxLinks)>
			<cfset result = false />
		<cfelseif Left(ARGUMENTS.link, Len(ARGUMENTS.domain)) neq ARGUMENTS.domain>
			<cfset result = false />
		<cfelse>
			<cflock name="qLinkLock" timeout="2">
				<cfquery name="qHasBeenChecked" dbtype="query">
					SELECT url
					FROM VARIABLES.qLinks
					WHERE url = <cfqueryparam value="#ARGUMENTS.link#" cfsqltype="cf_sql_varchar" />
				</cfquery>
			</cflock>
			<cfif qHasBeenChecked.recordCount>
				<cfset result = false />
			</cfif>
		</cfif>
		
		<cfreturn result />
	</cffunction>
	
	<!---
		Decides whether a fetched page is added to the result query.
		link: URL of the page.
		Returns false when an extension list is set and the text after the last "." of the URL
		(query string removed) is not in it. Otherwise returns false when the URL matches one of the
		pipe-delimited exclude filters. Filter forms: *text* = contains, text* = starts with,
		*text = ends with, text = equals. A * in the middle of a filter is not a wildcard.
	--->
	<cffunction name="shouldIndex">
		<cfargument name="link" required="true" />
		<cfset var result = true />
		<!--- Loop variables are declared locally so they stay out of the component's VARIABLES scope --->
		<cfset var filter = '' />
		<cfset var literalFilter = '' />
		
		<cfif ListLen(VARIABLES.extensions) and not ListFindNoCase(VARIABLES.extensions, ListLast(ListFirst(ARGUMENTS.link, '?'), '.'))>
			<cfset result = false />
		<!--- Count the filters with the same delimiter the loop below splits on --->
		<cfelseif ListLen(VARIABLES.excludeFilters, '|')>
			<cfloop index="filter" list="#VARIABLES.excludeFilters#" delimiters="|">
				<!--- The filter text with its wildcard markers removed --->
				<cfset literalFilter = Replace(filter, '*', '', 'ALL') />
				<cfif Left(filter, 1) eq '*' and Right(filter, 1) eq '*'>
					<cfif ARGUMENTS.link contains literalFilter>
						<cfset result = false />
					</cfif>
				<cfelseif Right(filter, 1) eq '*'>
					<cfif Left(ARGUMENTS.link, Len(literalFilter)) eq literalFilter>
						<cfset result = false />
					</cfif>
				<cfelseif Left(filter, 1) eq '*'>
					<cfif Right(ARGUMENTS.link, Len(literalFilter)) eq literalFilter>
						<cfset result = false />
					</cfif>
				<cfelse>
					<cfif ARGUMENTS.link eq filter>
						<cfset result = false />
					</cfif>
				</cfif>
				<!--- One matching filter is enough; the remaining filters cannot change the answer --->
				<cfif not result>
					<cfbreak />
				</cfif>
			</cfloop>
		</cfif>
		
		<cfreturn result />
	</cffunction>
	
	<!---
		Fetches one page, records it as visited, stores it in the result query when shouldIndex()
		allows it, and then recurses into each link found in the page that shouldFollow() accepts
		and that answers a HEAD request with status 200.
		page: URL to fetch.
		domain: prefix passed on to shouldFollow() to keep the crawl on one site.
		Returns nothing; results accumulate in VARIABLES.qData and VARIABLES.qLinks.
	--->
	<cffunction name="checkLinks">
		<cfargument name="page" required="true" />
		<cfargument name="domain" required="true" />
		<cfset var link = '' />
		<!--- This function calls itself, so all working state is declared locally to keep nested calls from sharing it --->
		<cfset var aLinks = ArrayNew(1) />
		<cfset var linkStatus = 0 />
		<cfset var pageResponse = '' />
		
		<!--- Get the page; resolveurl rewrites relative links in the content as absolute ones --->
		<cfhttp method="get" url="#ARGUMENTS.page#" redirect="true" resolveurl="true" timeout="10" result="pageResponse"></cfhttp>
		<cflock name="qLinkLock" timeout="2">
			<cfset QueryAddRow(VARIABLES.qLinks) />
			<cfset QuerySetCell(VARIABLES.qLinks, 'url', ARGUMENTS.page) /><!--- Remember the URL as visited, whatever its status --->
		</cflock>
		<cfif Val(pageResponse.statusCode) eq 200>
			<cfif shouldIndex(ARGUMENTS.page)>
				<cflock name="qDataLock" timeout="2">
					<cfset QueryAddRow(VARIABLES.qData) />
					<cfset QuerySetCell(VARIABLES.qData, 'url', getRelativePath(ARGUMENTS.page)) /><!--- Site-relative path of the page --->
					<cfset QuerySetCell(VARIABLES.qData, 'title', getPageTitle(pageResponse.fileContent)) /><!--- Page title --->
					<cfset QuerySetCell(VARIABLES.qData, 'body', getBrowsableContent(pageResponse.fileContent)) /><!--- Page text with markup removed --->
					<cfset QuerySetCell(VARIABLES.qData, 'itemDate', '') /><!--- No date is available from the crawl --->
				</cflock>
			</cfif>
			
			<!--- Parse out the absolute links in the page, ignoring whatever stripComments() removes --->
			<cfset aLinks = ReMatchNoCase('((((https?:|ftp:)\/\/)|(www\.|ftp\.))[-[:alnum:]\?$%,\.\/\|&##!@:=\+~_]+[A-Za-z0-9\/])', StripComments(pageResponse.fileContent)) />
			
			<!--- For each of the links --->
			<cfloop index="link" array="#aLinks#">
				<!--- Strip any bookmark ('#'), then drop an explicit default port from an http host.
				      The pattern is anchored to the host so other ports such as :8080, or a ":80"
				      later in the URL, are left alone. --->
				<cfset link = ReReplaceNoCase(ListFirst(link, '##'), '^(http://[^/:?]+):80(/|\?|$)', '\1\2', 'ONE') />
				<!--- If the link is on this site and hasn't been checked already --->
				<cfif shouldFollow(link, ARGUMENTS.domain)>
					<cfset linkStatus = GetStatus(link) />
					<!--- If the link is up --->
					<cfif linkStatus eq 200>
						<!--- Crawl its contents as well --->
						<cfset checkLinks(link, ARGUMENTS.domain) />
					</cfif>
				</cfif>
			</cfloop>
		</cfif>
		
		<cfreturn />
	</cffunction>
	
	<!---
		Reduces an HTML document to the text left after markup is removed.
		string: HTML source.
		Returns the input after stripComments(), with script elements, style elements and then
		all remaining tags deleted.
	--->
	<cffunction name="getBrowsableContent">
		<cfargument name="string" required="true" />
		<cfset var visibleText = StripComments(ARGUMENTS.string) />
		
		<!--- Script and style elements go first, contents included, so their code is not left behind as text --->
		<cfset visibleText = ReReplaceNoCase(visibleText, '<script.*?>.*?</script>', '', 'ALL') />
		<cfset visibleText = ReReplaceNoCase(visibleText, '<style.*?>.*?</style>', '', 'ALL') />
		<!--- Whatever tags remain are dropped; the text between them is kept --->
		<cfset visibleText = ReReplace(visibleText, '<[^>]*>', '', 'ALL') />
		
		<cfreturn visibleText />
	</cffunction>
	
	<!---
		Removes HTML comments from a string.
		string: HTML source.
		Returns the input with every HTML comment deleted, from its opening marker through the
		nearest closing marker.
	--->
	<cffunction name="stripComments">
		<cfargument name="string" required="true" />
		
		<!--- Lazy match, so each comment ends at its own closing marker and may contain hyphens, parentheses or tags --->
		<cfset ARGUMENTS.string = ReReplace(ARGUMENTS.string, '<!--.*?-->', '', 'ALL') />
		
		<cfreturn ARGUMENTS.string />
	</cffunction>
	
	<!---
		Extracts the text of the first title element from an HTML document.
		string: HTML source.
		Returns the title text with surrounding whitespace trimmed, or an empty string when the
		document has no title element or the title is empty.
	--->
	<cffunction name="getPageTitle">
		<cfargument name="string" required="true" />
		<!--- Case-insensitive, and tolerant of attributes on the title tag; subexpression 1 is the title text --->
		<cfset var titleMatch = REFindNoCase('<title[^>]*>([^<]*)</title>', ARGUMENTS.string, 1, true) />
		
		<!--- Guarding on the subexpression keeps a title-less page from returning the whole document --->
		<cfif ArrayLen(titleMatch.pos) gte 2 and titleMatch.pos[2] gt 0 and titleMatch.len[2] gt 0>
			<cfreturn Trim(Mid(ARGUMENTS.string, titleMatch.pos[2], titleMatch.len[2])) />
		</cfif>
		
		<cfreturn '' />
	</cffunction>
	
	<!---
		Reduces a URL to the part that follows the host name.
		path: absolute URL (any scheme), a host-first URL without a scheme, or a path starting with "/".
		Returns everything from the first "/" after the host onward, or an empty string when the
		URL has no such slash. A path that already starts with "/" is returned unchanged.
	--->
	<cffunction name="getRelativePath">
		<cfargument name="path" required="true" />
		
		<!--- One anchored pass removes an optional scheme plus the host, so https and ftp URLs are handled like http --->
		<cfset ARGUMENTS.path = ReReplaceNoCase(ARGUMENTS.path, '^([a-z][a-z0-9+.-]*://)?[^/]*', '', 'ONE') />
		
		<cfreturn ARGUMENTS.path />
	</cffunction>
</cfcomponent>

Comments (2)

Most fav SW parody: "The Empire Strikes Bike" . See my url.

I may be in need of this. Thanks

Post a comment