%META:TOPICPARENT{name="VirtSetCrawlerJobsGuide"}%

---+Example of Store and Extract functions for Setting up Crawler Jobs for Semantic Sitemaps -- a variation of standard sitemap

%TOC%

---++Example of Extract Function

<verbatim>
use WS;

-- Extract function for a semantic-sitemap crawl job.
--
-- Inspects a fetched document and queues further URLs for the crawler:
--   * for a semantic sitemap (/urlset/dataset) it queues every
--     dataDumpLocation and remembers the datasetURI as the target graph
--     in VFS_SITE.VS_UDATA, serialized as vector ('graph', <uri>);
--   * for a sitemap index (/sitemapindex/sitemap/loc) it queues every
--     listed sitemap location.
--
-- Parameters:
--   _host    host of the crawl job; used to locate the VFS_SITE row
--   _url     URL of the fetched document; its suffix selects the handling
--   _root    crawl-job root; identifies the VFS_SITE / VFS_QUEUE rows
--   _content document body; replaced by its uncompressed form for '.xml.gz'
--   _c_type  not used by this example
--   lev      not used by this example
--
-- Any SQL error (for example malformed XML) makes the procedure return
-- silently without reaching the final commit.
create procedure WS.WS.SITEMAP_BB_PARSE (
  in _host varchar,
  in _url varchar,
  in _root varchar,
  inout _content varchar,
  in _c_type varchar := null,
  in lev int := 0)
{
  --pl_debug+
  declare xt, xp, graph any;

  -- Deliberately best effort: a document that cannot be processed is skipped.
  -- To trace failures while developing, print __SQL_MESSAGE here with
  -- dbg_obj_print ().
  declare exit handler for sqlstate '*'
    {
      return;
    };

  if (_url like '%.xml.gz')
    {
      _content := gzip_uncompress (_content);
    }

  if (_url like '%.xml' or _url like '%.xml.gz' or _url like '%.rdf')
    {
      xt := xtree_doc (_content);

      -- Semantic sitemap: one dataset with a graph URI and its data dumps.
      if (xpath_eval ('/urlset/dataset', xt) is not null)
        {
          xp := xpath_eval ('/urlset/dataset/dataDumpLocation/text()', xt, 0);
          graph := cast (xpath_eval ('/urlset/dataset/datasetURI/text()', xt) as varchar);

          -- Remember the graph so the store function receives it as udata.
          if (length (graph))
            update VFS_SITE
               set VS_UDATA = serialize (vector ('graph', graph))
             where VS_HOST = _host
               and VS_ROOT = _root;

          foreach (any u in xp) do
            {
              declare hf any;
              declare host, url varchar;

              -- Text nodes may carry surrounding whitespace; trim before
              -- parsing, as the sitemap-index branch below does.
              u := trim (cast (u as varchar));
              hf := WS.WS.PARSE_URI (u);
              host := hf[1];
              url := hf[2];

              -- insert soft leaves an existing row untouched; when nothing
              -- was inserted, reset the existing queue entry to 'waiting'.
              insert soft VFS_QUEUE (VQ_HOST, VQ_TS, VQ_URL, VQ_STAT, VQ_ROOT, VQ_OTHER)
                values (host, now (), url, 'waiting', _root, NULL);
              if (row_count () = 0)
                update VFS_QUEUE
                   set VQ_STAT = 'waiting',
                       VQ_TS = now ()
                 where VQ_HOST = host
                   and VQ_ROOT = _root
                   and VQ_URL = url;
            }
        }

      -- Sitemap index: queue every referenced sitemap.
      if (xpath_eval ('/sitemapindex/sitemap/loc', xt) is not null)
        {
          xp := xpath_eval ('/sitemapindex/sitemap/loc/text()', xt, 0);

          foreach (any u in xp) do
            {
              declare hf any;
              declare host, url varchar;

              u := trim (cast (u as varchar));
              hf := WS.WS.PARSE_URI (u);
              host := hf[1];
              url := hf[2];

              -- Entries whose parsed URL part is empty are not queued.
              if (url <> '')
                {
                  insert soft VFS_QUEUE (VQ_HOST, VQ_TS, VQ_URL, VQ_STAT, VQ_ROOT, VQ_OTHER)
                    values (host, now (), url, 'waiting', _root, NULL);
                  if (row_count () = 0)
                    update VFS_QUEUE
                       set VQ_STAT = 'waiting',
                           VQ_TS = now ()
                     where VQ_HOST = host
                       and VQ_ROOT = _root
                       and VQ_URL = url;
                }
            }
        }
    }

  commit work;
}
;
</verbatim>

---++Example of Store Function

<verbatim>
use WS;

-- Store function for a semantic-sitemap crawl job.
--
-- When udata carries a 'graph' keyword and the URL ends in '.rdf', the
-- content is loaded as RDF/XML with that graph IRI passed as both the
-- second and third argument of DB.DBA.RDF_LOAD_RDFXML, and
-- DB.DBA.VT_INC_INDEX_DB_DBA_RDF_OBJ () is called afterwards.
-- In every case the URL is recorded in VFS_URL with the MD5 checksum of
-- the content, the current time and the supplied ETag.
--
-- Parameters:
--   _host      host the document was fetched from
--   _url       URL of the fetched document
--   _root      crawl-job root
--   _content   document body
--   _s_etag    ETag value stored in VFS_URL.VU_ETAG
--   _c_type    not used by this example
--   store_flag not used by this example
--   udata      keyword vector; only the 'graph' keyword is read
--   lev        not used by this example
create procedure WS.WS.SITEMAP_BB_STORE (
  in _host varchar,
  in _url varchar,
  in _root varchar,
  inout _content varchar,
  in _s_etag varchar,
  in _c_type varchar,
  in store_flag int := 1,
  in udata any := null,
  in lev int := 0)
{
  --pl_debug+
  declare graph, chksum varchar;

  if (isarray (udata))
    graph := get_keyword ('graph', udata);
  else
    graph := null;

  if (graph is not null and _url like '%.rdf')
    {
      DB.DBA.RDF_LOAD_RDFXML (_content, graph, graph);
      DB.DBA.VT_INC_INDEX_DB_DBA_RDF_OBJ ();
    }

  chksum := md5 (_content);

  -- insert soft leaves an existing row untouched; when nothing was
  -- inserted, refresh the existing VFS_URL entry instead.
  insert soft VFS_URL (VU_HOST, VU_URL, VU_CHKSUM, VU_CPTIME, VU_ETAG, VU_ROOT)
    values (_host, _url, chksum, now (), _s_etag, _root);
  if (row_count () = 0)
    update VFS_URL
       set VU_CHKSUM = chksum,
           VU_CPTIME = now (),
           VU_ETAG = _s_etag
     where VU_HOST = _host
       and VU_URL = _url
       and VU_ROOT = _root;

  commit work;
}
;
</verbatim>

---++Related

   * [[http://docs.openlinksw.com/virtuoso/rdfinsertmethods.html#rdfinsertmethodvirtuosocrawler][Setting up a Content Crawler Job to Import Linked Data into the Virtuoso Quad Store]]
   * [[VirtSetCrawlerJobsGuideSitemaps][Setting up a Content Crawler Job to Retrieve Sitemaps]] (when the source includes RDFa)
   * [[VirtSetCrawlerJobsGuideSemanticSitemaps][Setting up a Content Crawler Job to Retrieve Semantic Sitemaps]] (a variation of the standard sitemap)
   * [[VirtSetCrawlerJobsGuideDirectories][Setting up a Content Crawler Job to Retrieve Content from Specific Directories]]
   * [[VirtCrawlerGuideAtom][Setting up a Content Crawler Job to Retrieve Content from ATOM feed]]
   * [[VirtCrawlerSPARQLEndpoints][Setting up a Content Crawler Job to Retrieve Content from SPARQL endpoint]]