Virtuoso Open-Source Edition

  • Topic
  • Discussion
  • Example of Store and Extract functions for setting up Crawler Jobs for Semantic Sitemaps -- a variation of the standard sitemap

    Example of Extract Function

    use WS;
    
    -- Extract function for a crawler job: parses a (semantic) sitemap document
    -- and queues the URLs it references for crawling.
    --
    -- Parameters:
    --   _host    host of the site being crawled; used to locate the VFS_SITE row
    --   _url     URL of the document held in _content; its suffix decides
    --            whether the content is gunzipped and/or parsed as XML
    --   _root    crawl root; written to VQ_ROOT and used to locate VFS_SITE
    --   _content document body; replaced in place by the uncompressed text
    --            when _url ends in '.xml.gz'
    --   _c_type  not used by this implementation
    --   lev      not used by this implementation
    --
    -- Two document shapes are recognised:
    --   /urlset/dataset        every dataDumpLocation is queued, and datasetURI
    --                          (when non-empty) is stored as the 'graph' entry
    --                          of VFS_SITE.VS_UDATA
    --   /sitemapindex/sitemap  every loc is queued
    --
    -- Any SQL error raised while processing makes the procedure return
    -- silently, before the final commit (best-effort behaviour kept from the
    -- original example).
    create procedure WS.WS.SITEMAP_BB_PARSE (
      in _host varchar,
      in _url varchar,
      in _root varchar,
      inout _content varchar,
      in _c_type varchar := null,
      in lev int := 0)
    {
      --pl_debug+
      declare xt, xp, graph any;

      -- Swallow every error: a malformed sitemap must not abort the crawler.
      declare exit handler for sqlstate '*'
        {
          return;
        };

      if (_url like '%.xml.gz')
        {
          _content := gzip_uncompress (_content);
        }

      if (_url like '%.xml' or _url like '%.xml.gz' or _url like '%.rdf')
        {
          xt := xtree_doc (_content);

          -- Semantic sitemap: dataset description with data dump locations.
          if (xpath_eval ('/urlset/dataset', xt) is not null)
            {
              xp := xpath_eval ('/urlset/dataset/dataDumpLocation/text()', xt, 0);
              graph := cast (xpath_eval ('/urlset/dataset/datasetURI/text()', xt) as varchar);
              -- Remember the target graph on the site row so that it is
              -- available as user data for this crawl target.
              if (length (graph))
                update VFS_SITE
                   set VS_UDATA = serialize (vector ('graph', graph))
                 where VS_HOST = _host and VS_ROOT = _root;

              foreach (any u in xp) do
                {
                  declare hf any;
                  declare host, url varchar;

                  -- Trim surrounding whitespace of the text node, as the
                  -- sitemapindex branch below does.
                  u := trim (cast (u as varchar));
                  hf := WS.WS.PARSE_URI (u);
                  -- Element 1 is taken as the host, element 2 as the URL.
                  host := hf[1];
                  url := hf[2];
                  -- Skip entries without a URL part, consistent with the
                  -- sitemapindex branch.
                  if (url <> '')
                    {
                      insert soft VFS_QUEUE (VQ_HOST, VQ_TS, VQ_URL, VQ_STAT, VQ_ROOT, VQ_OTHER)
                          values (host, now (), url, 'waiting', _root, NULL);
                      -- No row inserted means the entry is already queued:
                      -- reset it to 'waiting' so it is fetched again.
                      if (row_count () = 0)
                        update VFS_QUEUE
                           set VQ_STAT = 'waiting', VQ_TS = now ()
                         where VQ_HOST = host and VQ_ROOT = _root and VQ_URL = url;
                    }
                }
            }

          -- Sitemap index: list of further sitemap documents.
          if (xpath_eval ('/sitemapindex/sitemap/loc', xt) is not null)
            {
              xp := xpath_eval ('/sitemapindex/sitemap/loc/text()', xt, 0);

              foreach (any u in xp) do
                {
                  declare hf any;
                  declare host, url varchar;

                  u := trim (cast (u as varchar));
                  hf := WS.WS.PARSE_URI (u);
                  host := hf[1];
                  url := hf[2];
                  if (url <> '')
                    {
                      insert soft VFS_QUEUE (VQ_HOST, VQ_TS, VQ_URL, VQ_STAT, VQ_ROOT, VQ_OTHER)
                          values (host, now (), url, 'waiting', _root, NULL);
                      if (row_count () = 0)
                        update VFS_QUEUE
                           set VQ_STAT = 'waiting', VQ_TS = now ()
                         where VQ_HOST = host and VQ_ROOT = _root and VQ_URL = url;
                    }
                }
            }
        }
      commit work;
    }
    ;
    
    

    Example of Store Function

    use WS;
    
    -- Store function for a crawler job: loads fetched RDF/XML into a graph and
    -- records the fetched URL in VFS_URL.
    --
    -- Parameters:
    --   _host      host the document was fetched from (VU_HOST)
    --   _url       URL of the document (VU_URL); only URLs ending in '.rdf'
    --              are loaded as RDF/XML
    --   _root      crawl root (VU_ROOT)
    --   _content   document body; loaded as RDF/XML and checksummed with md5
    --   _s_etag    ETag value recorded in VU_ETAG
    --   _c_type    not used by this implementation
    --   store_flag not used by this implementation
    --   udata      when it is an array, its 'graph' keyword names the graph
    --              the content is loaded into
    --   lev        not used by this implementation
    create procedure WS.WS.SITEMAP_BB_STORE (
      in _host varchar,
      in _url varchar,
      in _root varchar,
      inout _content varchar,
      in _s_etag varchar,
      in _c_type varchar,
      in store_flag int := 1,
      in udata any := null,
      in lev int := 0)
    {
      --pl_debug+
      declare target_graph, content_md5 varchar;

      -- Default to "no graph"; override only when user data is an array.
      target_graph := null;
      if (isarray (udata))
        target_graph := get_keyword ('graph', udata);

      -- Load RDF/XML documents into the graph, which is passed as both the
      -- second and third argument, then run the RDF_OBJ index increment.
      if (_url like '%.rdf' and target_graph is not null)
        {
          DB.DBA.RDF_LOAD_RDFXML (_content, target_graph, target_graph);
          DB.DBA.VT_INC_INDEX_DB_DBA_RDF_OBJ ();
        }

      content_md5 := md5 (_content);

      -- Upsert: try to insert the URL record; if nothing was inserted, the
      -- row already exists, so refresh its checksum, time and ETag.
      insert soft VFS_URL (VU_HOST, VU_URL, VU_CHKSUM, VU_CPTIME, VU_ETAG, VU_ROOT)
          values (_host, _url, content_md5, now (), _s_etag, _root);
      if (row_count () = 0)
        {
          update VFS_URL
             set VU_CHKSUM = content_md5,
                 VU_CPTIME = now (),
                 VU_ETAG = _s_etag
           where VU_HOST = _host
             and VU_URL = _url
             and VU_ROOT = _root;
        }
      commit work;
    }
    ;
    

    Related