<docbook><section><title>VOSLODData</title><bridgehead class="http://www.w3.org/1999/xhtml:h2">Loading data to LOD instance</bridgehead>
<para>To load the RDF data to the LOD instance, perform the following steps:</para>
<orderedlist spacing="compact"><listitem>Configure and start the cluster.</listitem>
<listitem>Execute the file, clrdfinx2.sql: <programlisting>--
--  $Id: clrdfinx2.sql,v 1.1.2.5 2009-05-07 08:49:49 mitko Exp $
--
--  Alternate RDF index scheme for cases where G unspecified
--
--  This file is part of the OpenLink Software Virtuoso Open-Source (VOS)
--  project.
--
--  Copyright (C) 1998-2009 OpenLink Software
--
--  This project is free software; you can redistribute it and/or modify it
--  under the terms of the GNU General Public License as published by the
--  Free Software Foundation; only version 2 of the License, dated June 1991.
--
--  This program is distributed in the hope that it will be useful, but
--  WITHOUT ANY WARRANTY; without even the implied warranty of
--  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
--  General Public License for more details.
--
--  You should have received a copy of the GNU General Public License along
--  with this program; if not, write to the Free Software Foundation, Inc.,
--  51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
--
--

-- Step 1: drop the existing OGPS index on RDF_QUAD before the table is rebuilt.
DROP INDEX RDF_QUAD_OGPS;
CHECKPOINT;
-- Step 2: create R2, the replacement quad table. It has the same four columns
-- that the INSERT below copies, is keyed on (S, P, O, G), and its primary key
-- index is partitioned on S.
CREATE TABLE R2 
   (
     G  iri_id_8, 
     S  iri_id_8, 
     P  iri_id_8, 
     O  ANY, 
     PRIMARY KEY  (S, P, O, G)
   )
ALTER INDEX R2 
  ON R2 
  PARTITION ( S  INT (0hexffff00) );

-- NOTE(review): LOG_ENABLE (2) presumably selects a non-logged, row-by-row
-- autocommit mode for the bulk copy that follows -- confirm against the
-- Virtuoso log_enable documentation.
LOG_ENABLE (2);
-- Step 3: copy every quad from the old table into R2.
INSERT INTO R2 (G, S, P, O) SELECT G, S, P, O FROM rdf_quad;

-- Step 4: remove the old table and give R2 its name.
DROP TABLE RDF_QUAD;
ALTER TABLE  r2  RENAME  RDF_QUAD;
CHECKPOINT;
-- Step 5: add three bitmap indices on the new RDF_QUAD (column orders OPGS,
-- POGS and GPOS). All three are partitioned on O.
CREATE BITMAP INDEX  RDF_QUAD_OPGS 
  ON  RDF_QUAD (O, P, G, S) 
  PARTITION ( O  VARCHAR (-1, 0hexffff) );
CREATE BITMAP INDEX  RDF_QUAD_POGS 
  ON RDF_QUAD (P, O, G, S) 
  PARTITION ( O  VARCHAR (-1, 0hexffff) );
CREATE BITMAP INDEX  RDF_QUAD_GPOS 
  ON RDF_QUAD (G, P, O, S)
  PARTITION ( O  VARCHAR (-1, 0hexffff) );

CHECKPOINT;
</programlisting></listitem>
<listitem>Execute the file: <programlisting>CREATE TABLE load_list 
  (
    -- Work queue of files to load: one row per file, keyed by its path.
    -- ll_file:  path of the file to load (ld_dir stores path || / || name).
    ll_file       VARCHAR,
    -- ll_graph: graph IRI passed to the loader for this file.
    ll_graph      VARCHAR,
    ll_state      INT  DEFAULT 0,   -- 0 not started, 1 going, 2 done
    -- ll_started: set by ld_array when the file is handed to a loader.
    ll_started    DATETIME,
    -- ll_done: set when loading finishes or fails (rdf_loader_run, ld_file).
    ll_done       DATETIME,
    -- ll_host: set by ld_array from sys_stat cl_this_host when running clustered.
    ll_host       INT,
    -- NOTE(review): ll_work_time is not written by any procedure in this script.
    ll_work_time  INTEGER,
    -- ll_error: SQL state and message recorded by the ld_file error handler.
    ll_error      VARCHAR ,
    PRIMARY KEY (ll_file)
  )
ALTER INDEX load_list 
  ON load_list 
  PARTITION (ll_file VARCHAR)
;

-- Secondary index used by the ld_array cursor (TABLE OPTION (index ll_state))
-- to pick up rows with ll_state = 0.
CREATE INDEX ll_state 
  ON load_list 
       ( 
         ll_state, 
         ll_file, 
         ll_graph
       ) 
  PARTITION (ll_state int)
;


-- Single-row lock table: rdf_loader_run selects the row with id = 0 FOR UPDATE
-- before calling ld_array.
CREATE TABLE  ldlock (id INT PRIMARY KEY)
  ALTER INDEX ldlock ON ldlock PARTITION (id INT)
;

-- Seed the one row that the loaders lock on.
INSERT INTO  ldlock 
  VALUES (0);


CREATE PROCEDURE ld_dir (IN path VARCHAR, IN mask VARCHAR, IN graph VARCHAR)
{
  -- Registers the files of one directory in DB.DBA.LOAD_LIST.
  --
  --   path   directory to scan, using sys_dirlist (path, 1)
  --   mask   LIKE pattern that an entry name must match to be registered
  --   graph  fallback graph IRI when no .graph file supplies one
  --
  -- For every matching entry that is not yet listed, the graph is taken from
  -- the first of these that file_stat reports as nonzero:
  --   1. the entry name without .gz, plus .graph
  --   2. the same name with a -digits part before .n removed, plus .graph
  --   3. global.graph in the same directory
  -- and otherwise from the graph argument. Entries whose resulting graph is
  -- NULL are not registered. Each matching entry is handled in its own
  -- serializable transaction, committed before moving on.
  DECLARE entries ANY;
  DECLARE idx, n_entries INT;
  DECLARE entry_name, entry_path, stem VARCHAR;
  DECLARE own_graph_file, shared_graph_file, dir_graph_file, target_graph VARCHAR;

  entries := sys_dirlist (path, 1);
  n_entries := LENGTH (entries);
  idx := 0;

  WHILE (idx &lt; n_entries)
    {
      entry_name := entries[idx];

      IF (entry_name LIKE mask)
        {
          entry_path := path || &#39;/&#39; || entry_name;

          SET ISOLATION = &#39;serializable&#39;;

          IF (NOT EXISTS (SELECT 1
                            FROM DB.DBA.LOAD_LIST
                            WHERE LL_FILE = entry_path FOR UPDATE))
            {
              stem := REPLACE (entry_name, &#39;.gz&#39;, &#39;&#39;);
              own_graph_file := path || &#39;/&#39; || stem || &#39;.graph&#39;;
              shared_graph_file := path || &#39;/&#39; || regexp_replace (stem, &#39;\\-[0-9]+\\.n&#39;, &#39;.n&#39;) || &#39;.graph&#39;;
              dir_graph_file := path || &#39;/&#39; || &#39;global.graph&#39;;

              IF (file_stat (own_graph_file) &lt;&gt; 0)
                {
                  target_graph := TRIM (file_to_string (own_graph_file), &#39; \r\n&#39;);
                }
              ELSE IF (file_stat (shared_graph_file) &lt;&gt; 0)
                {
                  target_graph := TRIM (file_to_string (shared_graph_file), &#39; \r\n&#39;);
                }
              ELSE IF (file_stat (dir_graph_file) &lt;&gt; 0)
                {
                  target_graph := TRIM (file_to_string (dir_graph_file), &#39; \r\n&#39;);
                }
              ELSE
                {
                  target_graph := graph;
                }

              IF (target_graph IS NOT NULL)
                {
                  INSERT INTO DB.DBA.LOAD_LIST (ll_file, ll_graph)
                    VALUES (entry_path, target_graph);
                }
            }

          COMMIT WORK;
        }

      idx := idx + 1;
    }
}
;

CREATE PROCEDURE ld_dir_all (IN path VARCHAR, IN mask VARCHAR, IN graph VARCHAR)
{
  -- Recursive variant of ld_dir: registers the matching files of path itself
  -- and then descends into every entry returned by sys_dirlist (path, 0),
  -- skipping the . and .. entries.
  --
  --   path, mask, graph  passed unchanged to ld_dir at every level
  DECLARE children ANY;
  DECLARE idx, n_children INT;

  children := sys_dirlist (path, 0);
  ld_dir (path, mask, graph);

  n_children := LENGTH (children);
  idx := 0;
  WHILE (idx &lt; n_children)
    {
      IF (NOT (children[idx] = &#39;.&#39; OR children[idx] = &#39;..&#39;))
        ld_dir_all (path||&#39;/&#39;||children[idx], mask, graph);
      idx := idx + 1;
    }
}
;

CREATE PROCEDURE ld_add
  (
    IN _fname VARCHAR,
    IN _graph VARCHAR
  )
{
  -- Registers a single file in DB.DBA.LOAD_LIST unless a row with the same
  -- LL_FILE already exists. The check and the insert run under serializable
  -- isolation and the transaction is committed in either case.
  --
  --   _fname  value stored in LL_FILE
  --   _graph  value stored in LL_GRAPH
  SET ISOLATION = &#39;serializable&#39;;

  IF (EXISTS (SELECT 1 FROM DB.DBA.LOAD_LIST WHERE LL_FILE = _fname FOR UPDATE))
    {
      -- Already listed: nothing to add, just end the transaction.
      COMMIT WORK;
      RETURN;
    }

  INSERT INTO DB.DBA.LOAD_LIST (LL_FILE, LL_GRAPH)
    VALUES (_fname, _graph);
  COMMIT WORK;
}
;

CREATE PROCEDURE ld_ttlp_flags (IN fname VARCHAR)
{
  -- Returns the flags value that ld_file passes to TTLP for the file fname.
  --
  --   fname  file path; only its text is inspected, the file is not opened
  --
  -- Paths containing /btc-2009 or .nq get 255 + 512; every other path gets 255.
  -- NOTE(review): 512 presumably enables quad (graph in fourth field) parsing
  -- in TTLP -- confirm against the TTLP flags documentation.
  IF (fname LIKE &#39;%/btc-2009%&#39; OR fname LIKE &#39;%.nq%&#39;)
    RETURN 255 + 512;
  RETURN 255;
}
;

CREATE PROCEDURE
ld_file (IN f VARCHAR, IN graph VARCHAR)
{
  -- Loads one file into the graph, choosing the loader from the file name.
  --
  --   f      path of the file; a trailing .gz selects gz_file_open
  --   graph  passed as both graph arguments of the loader call
  --
  -- Dispatch, decided on the name with a trailing .gz removed:
  --   .grdf (or .grdf.gz)   load_grdf
  --   .xml / .owl / .rdf    DB.DBA.RDF_LOAD_RDFXML
  --   .n4                   TTLP with flags 512 + 255
  --   anything else         TTLP with flags from ld_ttlp_flags (f)
  --
  -- Any SQL error rolls back, marks the LOAD_LIST row as done with the error
  -- text in LL_ERROR, logs a message and returns without re-signalling.
  DECLARE base_name VARCHAR;
  DECLARE is_gz, ttl_flags INT;
  DECLARE EXIT HANDLER FOR SQLSTATE &#39;*&#39; {
    ROLLBACK WORK;
    UPDATE DB.DBA.LOAD_LIST
      SET LL_STATE = 2,
          LL_DONE = CURDATETIME (),
          LL_ERROR = __sql_state || &#39; &#39; || __sql_message
      WHERE LL_FILE = f;
    COMMIT WORK;

    LOG_MESSAGE (sprintf (&#39; File %s error %s %s&#39;, f, __sql_state, __sql_message));
    RETURN;
  };

  IF (f LIKE &#39;%.grdf&#39; OR f LIKE &#39;%.grdf.gz&#39;)
    {
      load_grdf (f);
      RETURN;
    }

  -- Work out whether the file is gzipped and what its name is without .gz.
  IF (f LIKE &#39;%.gz&#39;)
    {
      is_gz := 1;
      base_name := regexp_replace (f, &#39;\.gz\x24&#39;, &#39;&#39;);
    }
  ELSE
    {
      is_gz := 0;
      base_name := f;
    }

  IF (base_name LIKE &#39;%.xml&#39; OR base_name LIKE &#39;%.owl&#39; OR base_name LIKE &#39;%.rdf&#39;)
    {
      IF (is_gz = 1)
        {
          DB.DBA.RDF_LOAD_RDFXML (gz_file_open (f), graph, graph);
        }
      ELSE
        {
          DB.DBA.RDF_LOAD_RDFXML (file_open (f), graph, graph);
        }
      RETURN;
    }

  -- Everything else goes through TTLP; only the flags differ.
  IF (base_name LIKE &#39;%.n4&#39;)
    {
      ttl_flags := 512 + 255;
    }
  ELSE
    {
      ttl_flags := ld_ttlp_flags (f);
    }

  IF (is_gz = 1)
    {
      TTLP (gz_file_open (f), graph, graph, ttl_flags);
    }
  ELSE
    {
      TTLP (file_open (f), graph, graph, ttl_flags);
    }
}
;

CREATE PROCEDURE rdf_load_dir
  (
    IN path   VARCHAR,
    IN mask   VARCHAR := &#39;%.nt&#39;,
    IN graph  VARCHAR := &#39;http://dbpedia.org&#39;
  )
{
  -- Convenience entry point: registers the files of one directory and then
  -- runs the loader in the calling connection.
  --
  --   path   directory handed to ld_dir
  --   mask   LIKE pattern for file names
  --   graph  fallback graph IRI handed to ld_dir

  -- Remove a leftover stop marker so that the loader does not exit at once.
  DELETE FROM DB.DBA.LOAD_LIST
    WHERE LL_FILE = &#39;##stop&#39;;
  COMMIT WORK;

  ld_dir (path, mask, graph);
  rdf_loader_run ();
}
;


CREATE PROCEDURE ld_array ()
{
  -- Claims the next batch of unstarted files for the calling loader.
  --
  -- Reads up to 100 rows with LL_STATE = 0 through the ll_state index and
  -- stops early once the summed file_stat (f, 1) sizes exceed 2000000.
  -- The claimed rows are then set to LL_STATE = 1 with LL_STARTED set to the
  -- current time and LL_HOST set to this host (NULL when cl_run_local_only
  -- is 1).
  --
  -- Returns 0 when no row with LL_STATE = 0 exists, otherwise a 100-element
  -- array whose leading elements are VECTOR (file, graph) pairs; unused
  -- trailing elements keep the make_array default, which rdf_loader_run
  -- tests with 0 = arr[inx].
  --
  -- The caller (rdf_loader_run) holds the ldlock row FOR UPDATE around this
  -- call and commits afterwards.
  DECLARE first, last, arr, len, local ANY;
  DECLARE cr CURSOR FOR
      SELECT TOP 100 LL_FILE, LL_GRAPH
        FROM DB.DBA.LOAD_LIST TABLE OPTION (index ll_state)
        WHERE LL_STATE = 0
    FOR UPDATE;
  DECLARE fill INT;
  DECLARE f, g VARCHAR;
  WHENEVER NOT FOUND GOTO DONE;
  first := 0;
  last := 0;
  -- Explicit default: LL_HOST stays NULL when not running clustered.
  local := NULL;
  arr := make_array (100, &#39;any&#39;);
  fill := 0;
  open cr;
  len := 0;
  FOR (;;)
    {
      FETCH cr INTO f, g;
      IF (0 = first) first := f;
      last := f;
      arr[fill] := VECTOR (f, g);
      len := len + cast (file_stat (f, 1) AS INT);
      fill := fill + 1;
      IF (len &gt; 2000000)
        GOTO DONE;
    }
 DONE:
  IF (0 = first)
    RETURN 0;
  IF (1 &lt;&gt; sys_stat (&#39;cl_run_local_only&#39;))
    local := sys_stat (&#39;cl_this_host&#39;);
  -- Only rows that are still unstarted may be claimed. Without the ll_state
  -- test, rows already in state 1 or 2 whose names sort between first and
  -- last would be reset to state 1.
  -- NOTE(review): the range predicate assumes the cursor returns rows in
  -- ll_file order via the ll_state index -- confirm.
  UPDATE DB.DBA.LOAD_LIST SET ll_state = 1, ll_started = curdatetime (), LL_HOST = local
    WHERE ll_file &gt;= first AND ll_file &lt;= last AND ll_state = 0;
  RETURN arr;
}
;

CREATE PROCEDURE
rdf_loader_run (IN max_files INTEGER := NULL, IN log_enable INT := 2)
{
  -- Main loader loop: repeatedly claims a batch of files with ld_array and
  -- loads each one with ld_file until no unstarted file is left, a ##stop row
  -- appears in LOAD_LIST, or max_files is used up.
  --
  --   max_files   NULL means no limit. Otherwise the loop ends once the value
  --               is zero or negative; it is reduced by 100 after every batch,
  --               whatever the actual batch size was.
  --   log_enable  mode passed to log_enable () while files are being loaded;
  --               its lowest bit is the mode used for the bookkeeping steps.
  DECLARE sec_delay FLOAT;
  DECLARE _f, _graph VARCHAR;
  DECLARE arr ANY;
  DECLARE xx, inx, tx_mode, ld_mode INT;
  ld_mode := log_enable;
  -- Cluster-only setup, issued through cl_exec from host 1.
  if (0 = sys_stat (&#39;cl_run_local_only&#39;))
    {
      IF (log_enable = 2 AND cl_this_host () = 1)
    {
      cl_exec (&#39;checkpoint_interval (0)&#39;);
      cl_exec (&#39;__dbf_set (&#39;&#39;cl_non_logged_write_mode&#39;&#39;, 1)&#39;);
    }
      IF (cl_this_host () = 1)
    cl_exec(&#39;__dbf_set(&#39;&#39;cl_max_keep_alives_missed&#39;&#39;,3000)&#39;);
    }
  tx_mode := bit_and (1, log_enable);
  log_message (&#39;Loader started&#39;);

  -- Clear a stop marker left over from an earlier rdf_load_stop call.
  DELETE FROM  DB.DBA.LOAD_LIST WHERE LL_FILE = &#39;##stop&#39;;
  COMMIT WORK;

  WHILE (1)
    {
      SET ISOLATION = &#39;repeatable&#39;;
      -- On SQLSTATE 40001: roll back, wait a random time below one second,
      -- then retry from AGAIN.
      DECLARE EXIT HANDLER FOR SQLSTATE &#39;40001&#39; {
    ROLLBACK WORK;
        sec_delay := rnd(1000)*0.001;
    log_message(sprintf(&#39;deadlock in loader, waiting %d milliseconds&#39;, CAST (sec_delay * 1000 AS INTEGER)));
    DELAY(sec_delay);
    GOTO AGAIN;
      };

     AGAIN:;

      -- rdf_load_stop signals a stop by inserting a row named ##stop.
      IF (EXISTS (SELECT 1 FROM DB.DBA.LOAD_LIST WHERE LL_FILE = &#39;##stop&#39;))
    {
      log_message (&#39;File load stopped by rdf_load_stop.&#39;);
      RETURN;
    }

      log_enable (tx_mode, 1);

      IF (max_files IS NOT NULL AND max_files &lt;= 0)
        {
      COMMIT WORK;
      log_message (&#39;Max_files reached. Finishing.&#39;);
          RETURN;
    }

      WHENEVER NOT FOUND GOTO looks_empty;

      --      log_message (&#39;Getting next file.&#39;);
      -- Take the single ldlock row FOR UPDATE before claiming a batch.
      -- NOTE(review): this presumably serializes ld_array across concurrent
      -- loaders until the COMMIT below -- confirm.
      SET ISOLATION = &#39;serializable&#39;;
      SELECT id INTO xx FROM ldlock WHERE id = 0 FOR UPDATE;
      arr := ld_array ();
      COMMIT WORK;
      -- ld_array returns 0 when there is no unstarted file.
      IF (0 = arr)
    GOTO looks_empty;
      log_enable (ld_mode, 1);

      -- Load the claimed files in order; the first unused slot ends the batch.
      FOR (inx := 0; inx &lt; 100; inx := inx + 1)
    {
      IF (0 = arr[inx])
        GOTO arr_done;
      ld_file (arr[inx][0], arr[inx][1]);
      -- ld_file records errors itself and returns normally, so the row is
      -- marked done here in either case.
      UPDATE DB.DBA.LOAD_LIST 
        SET  LL_STATE = 2, 
             LL_DONE = CURDATETIME () 
        WHERE LL_FILE = arr[inx][0];
    }
    arr_done:
      log_enable (tx_mode, 1);


      -- The budget is charged a flat 100 per batch, not the number loaded.
      IF (max_files IS NOT NULL) max_files := max_files - 100;

      COMMIT WORK;
    }

 looks_empty:
  COMMIT WORK;
  log_message (&#39;No more files to load. Loader has finished,&#39;);
  RETURN;

}
;

CREATE PROCEDURE rdf_load_stop (IN force INT := 0)
{
  -- Asks running loaders to stop by adding the ##stop marker row to
  -- LOAD_LIST; rdf_loader_run checks for that row at the top of every batch.
  --
  --   force  when nonzero, additionally runs txn_killall (1) through cl_exec
  --
  -- INSERT SOFT keeps the call repeatable: if the marker row already exists
  -- the insert is ignored. A plain INSERT would fail on the primary key, so
  -- the forced kill below would never be reached.
  INSERT SOFT DB.DBA.LOAD_LIST (LL_FILE) VALUES (&#39;##stop&#39;);
  COMMIT WORK;
  IF (force)
    cl_exec (&#39;txn_killall (1)&#39;);
}
;


CREATE PROCEDURE RDF_LOADER_RUN_1
  (
    IN x INT,
    IN y INT
  )
{
  -- Two-argument wrapper around rdf_loader_run, named as the async queue
  -- target in rdf_ld_srv.
  --
  --   x  forwarded as max_files
  --   y  forwarded as log_enable
  rdf_loader_run (x, y);
}
;

CREATE PROCEDURE rdf_ld_srv
  (
    IN log_enable INT
  )
{
  -- Runs the loader through an async queue and blocks until it has finished.
  --
  --   log_enable  forwarded to rdf_loader_run; max_files is passed as NULL
  --
  -- NOTE(review): the argument 1 of async_queue is presumably the number of
  -- worker threads -- confirm.
  DECLARE loader_queue ANY;

  loader_queue := async_queue (1);
  aq_request
    (
      loader_queue,
      &#39;DB.DBA.RDF_LOADER_RUN_1&#39;,
      VECTOR (NULL, log_enable)
    );
  aq_wait_all (loader_queue);
}
;


CREATE PROCEDURE load_grdf (IN f VARCHAR)
{
  -- Loads a .grdf file, read here as alternating lines: a graph name line
  -- followed by a line that is passed to DB.DBA.RDF_LOAD_RDFXML with that
  -- graph name as both graph arguments.
  --
  --   f  path of the file; names ending in .gz are opened with gz_file_open
  --
  -- Returns as soon as either read yields 0.
  DECLARE src ANY;
  DECLARE graph_name VARCHAR;
  DECLARE rdfxml_text ANY;
  DECLARE pairs_done INT;

  pairs_done := 0;
  rdfxml_text := &#39;&#39;;

  IF (f LIKE &#39;%.gz&#39;)
    {
      src := gz_file_open (f);
    }
  ELSE
    {
      src := file_open (f);
    }

  WHILE (rdfxml_text &lt;&gt; 0)
    {
      graph_name := ses_read_line (src, 0, 0, 1);
      IF (graph_name = 0)
        RETURN;

      rdfxml_text := ses_read_line (src, 0, 0, 1);
      IF (rdfxml_text = 0)
        RETURN;

      DB.DBA.RDF_LOAD_RDFXML (rdfxml_text, graph_name, graph_name);
      pairs_done := pairs_done + 1;
    }
}
;



-- cl_exec (&#39;set lock_escalation_pct = 110&#39;);
-- cl_exec (&#39;DB.DBA.RDF_LD_SRV (1)&#39;) &amp;
-- cl_exec (&#39;DB.DBA.RDF_LD_SRV (2)&#39;) &amp;

</programlisting></listitem>
<listitem>Execute: <programlisting>cl_exec (&#39;checkpoint&#39;);
</programlisting></listitem>
<listitem>Execute ld_dir (&#39;directory&#39;, &#39;mask&#39;, &#39;graph&#39;), for example: <programlisting>ld_dir (&#39;/dbs/data&#39;, &#39;*.gz&#39;, &#39;http://dbpedia.org&#39;);
</programlisting></listitem>
<listitem>On every node, execute the following from a separate client connection: <programlisting>rdf_loader_run();
</programlisting></listitem>
</orderedlist></section></docbook>