<docbook><section><title>VirtSetCrawlerJobsGuideSemanticSitemapsFuncExample</title><title>Example of Store and Extract Functions for Setting up Crawler Jobs for Semantic Sitemaps (a variation of the standard sitemap)</title><para>Example of Store and Extract functions for setting up Crawler Jobs for Semantic Sitemaps (a variation of the standard sitemap).</para>
<bridgehead class="http://www.w3.org/1999/xhtml:h2">Example of Extract Function</bridgehead>
<para> </para>
<programlisting>use WS;

-- Extract function for a semantic sitemap crawl job.
--
-- Parses the fetched document when its URL ends in .xml, .xml.gz or .rdf and
-- queues the links found in it:
--   * /urlset/dataset          : every dataDumpLocation is queued, and the
--                                datasetURI (if any) is saved as the graph name
--                                in VFS_SITE.VS_UDATA for the (_host, _root) site.
--   * /sitemapindex/sitemap/loc : every nested sitemap location is queued.
--
-- Parameters:
--   _host    host of the crawled site; used to locate the VFS_SITE row
--   _url     URL of the fetched document; its suffix decides whether to parse
--   _root    crawl root; used for the VFS_SITE and VFS_QUEUE rows
--   _content document body; replaced by its uncompressed form for .xml.gz
--   _c_type  not used by this procedure
--   lev      not used by this procedure
--
-- Any SQL error raised while parsing or queueing makes the procedure return
-- silently (best effort), without reaching the final commit.
create procedure WS.WS.SITEMAP_BB_PARSE (
  in _host varchar,
  in _url varchar,
  in _root varchar,
  inout _content varchar,
  in _c_type varchar := null,
  in lev int := 0)
{
  --pl_debug+
  declare xt, xp, graph any;

  -- Best effort: a malformed document must not abort the crawler.
  declare exit handler for sqlstate &#39;*&#39;
    {
--      dbg_obj_print (__SQL_MESSAGE);
      return;
    };

  -- Compressed sitemaps are expanded in place before parsing.
  if (_url like &#39;%.xml.gz&#39;)
    {
      _content := gzip_uncompress (_content);
    }

  if (_url like &#39;%.xml&#39; or _url like &#39;%.xml.gz&#39; or _url like &#39;%.rdf&#39;)
    {
      xt := xtree_doc (_content);

      -- Semantic sitemap: dataset description with dump locations.
      if (xpath_eval (&#39;/urlset/dataset&#39;, xt) is not null)
        {
          xp := xpath_eval (&#39;/urlset/dataset/dataDumpLocation/text()&#39;, xt, 0);
          graph := cast (xpath_eval (&#39;/urlset/dataset/datasetURI/text()&#39;, xt) as varchar);
          -- Remember the target graph on the site row so the store function
          -- can read it from the user data.
          if (length (graph))
            update VFS_SITE set VS_UDATA = serialize (vector (&#39;graph&#39;, graph))
              where VS_HOST = _host and VS_ROOT = _root;
          foreach (any u in xp) do
            {
              declare hf any;
              declare host, url varchar;

              -- Text nodes may carry surrounding whitespace; strip it before parsing.
              u := trim (cast (u as varchar));
              hf := WS.WS.PARSE_URI (u);
              host := hf[1];
              url := hf[2];
              -- Skip entries that yield no path, as the sitemap index branch does.
              if (url &lt;&gt; &#39;&#39;)
                {
                  insert soft VFS_QUEUE (VQ_HOST, VQ_TS, VQ_URL, VQ_STAT, VQ_ROOT, VQ_OTHER)
                      values (host, now (), url, &#39;waiting&#39;, _root, NULL);
                  -- Row already queued: put it back into the waiting state.
                  if (row_count () = 0)
                    update VFS_QUEUE set VQ_STAT = &#39;waiting&#39;, VQ_TS = now ()
                      where VQ_HOST = host and VQ_ROOT = _root and VQ_URL = url;
                }
            }
        }

      -- Sitemap index: queue every nested sitemap location.
      if (xpath_eval (&#39;/sitemapindex/sitemap/loc&#39;, xt) is not null)
        {
          xp := xpath_eval (&#39;/sitemapindex/sitemap/loc/text()&#39;, xt, 0);
          foreach (any u in xp) do
            {
              declare hf any;
              declare host, url varchar;

              u := trim (cast (u as varchar));
              hf := WS.WS.PARSE_URI (u);
              host := hf[1];
              url := hf[2];
              if (url &lt;&gt; &#39;&#39;)
                {
                  insert soft VFS_QUEUE (VQ_HOST, VQ_TS, VQ_URL, VQ_STAT, VQ_ROOT, VQ_OTHER)
                      values (host, now (), url, &#39;waiting&#39;, _root, NULL);
                  -- Row already queued: put it back into the waiting state.
                  if (row_count () = 0)
                    update VFS_QUEUE set VQ_STAT = &#39;waiting&#39;, VQ_TS = now ()
                      where VQ_HOST = host and VQ_ROOT = _root and VQ_URL = url;
                }
            }
        }
    }
  commit work;
}
;

</programlisting><bridgehead class="http://www.w3.org/1999/xhtml:h2">Example of Store Function</bridgehead>
<para> </para>
<programlisting>use WS;

-- Store function for a semantic sitemap crawl job.
--
-- When the site user data carries a graph name and the fetched URL ends in
-- .rdf, the content is loaded as RDF/XML into that graph (the graph name is
-- also passed as the base) and the RDF object text index is refreshed.
-- In every case the URL is recorded in VFS_URL with the md5 checksum of the
-- content, the current time and the supplied ETag; an existing row is updated.
--
-- Parameters:
--   _host, _url, _root  identify the VFS_URL row
--   _content            fetched document body
--   _s_etag             ETag value stored in VU_ETAG
--   _c_type             not used by this procedure
--   store_flag          not used by this procedure
--   udata               site user data; the graph name is read from its
--                       graph keyword when it is an array
--   lev                 not used by this procedure
create procedure WS.WS.SITEMAP_BB_STORE (
  in _host varchar,
  in _url varchar,
  in _root varchar,
  inout _content varchar,
  in _s_etag varchar,
  in _c_type varchar,
  in store_flag int := 1,
  in udata any := null,
  in lev int := 0)
{
  --pl_debug+
  declare target_graph varchar;
  declare content_sum varchar;

  -- Default to no graph; override only when user data is available.
  target_graph := null;
  if (isarray (udata))
    target_graph := get_keyword (&#39;graph&#39;, udata);

  if (target_graph is not null and _url like &#39;%.rdf&#39;)
    {
      DB.DBA.RDF_LOAD_RDFXML (_content, target_graph, target_graph);
      DB.DBA.VT_INC_INDEX_DB_DBA_RDF_OBJ ();
    }

  -- One checksum serves both the insert and the fallback update.
  content_sum := md5 (_content);

  insert soft VFS_URL (VU_HOST, VU_URL, VU_CHKSUM, VU_CPTIME, VU_ETAG, VU_ROOT)
      values (_host, _url, content_sum, now (), _s_etag, _root);
  -- Nothing inserted means the row exists already: refresh it instead.
  if (row_count () = 0)
    {
      update VFS_URL
         set VU_CHKSUM = content_sum, VU_CPTIME = now (), VU_ETAG = _s_etag
       where VU_HOST = _host and VU_URL = _url and VU_ROOT = _root;
    }
  commit work;
}
;
</programlisting><bridgehead class="http://www.w3.org/1999/xhtml:h2">Related</bridgehead>
<itemizedlist mark="bullet" spacing="compact"><listitem><ulink url="http://docs.openlinksw.com/virtuoso/rdfinsertmethods.html#rdfinsertmethodvirtuosocrawler">Setting up a Content Crawler Job to Import Linked Data into the Virtuoso Quad Store</ulink> </listitem>
<listitem><ulink url="VirtSetCrawlerJobsGuideSitemaps">Setting up a Content Crawler Job to Retrieve Sitemaps</ulink> (when the source includes RDFa) </listitem>
<listitem><ulink url="VirtSetCrawlerJobsGuideSemanticSitemaps">Setting up a Content Crawler Job to Retrieve Semantic Sitemaps</ulink> (a variation of the standard sitemap) </listitem>
<listitem><ulink url="VirtSetCrawlerJobsGuideDirectories">Setting up a Content Crawler Job to Retrieve Content from Specific Directories</ulink> </listitem>
<listitem><ulink url="VirtCrawlerGuideAtom">Setting up a Content Crawler Job to Retrieve Content from an ATOM feed</ulink> </listitem>
<listitem><ulink url="VirtCrawlerSPARQLEndpoints">Setting up a Content Crawler Job to Retrieve Content from a SPARQL endpoint</ulink> </listitem>
</itemizedlist></section></docbook>