RDF dumps from Virtuoso Quad store hosted data

Every DBMS needs to offer a mechanism for bulk export and import of data.

Virtuoso supports dumping and reloading graph model data (e.g., RDF), as well as relational data (e.g., SQL) (discussed elsewhere).

How is it done?

We have created stored procedures for the task. The dump procedures leverage SPARQL to facilitate selective data dump(s) from one or more Named Graphs, each denoted by an IRI.

Dump One Graph


CREATE PROCEDURE dump_one_graph 
  ( IN  srcgraph           VARCHAR  , 
    IN  out_file           VARCHAR  , 
    IN  file_length_limit  INTEGER  := 1000000000
  )
  {
    DECLARE  file_name     VARCHAR;
    DECLARE  env, 
             ses           ANY;
    DECLARE  ses_len, 
             max_ses_len, 
             file_len, 
             file_idx      INTEGER;
    SET ISOLATION = 'uncommitted';
    max_ses_len  := 10000000;
    file_len     := 0;
    file_idx     := 1;
    file_name    := sprintf ('%s%06d.ttl', out_file, file_idx);
    string_to_file ( file_name || '.graph', 
                     srcgraph, 
                     -2
                   );
    string_to_file ( file_name, 
                     sprintf ( '# Dump of graph <%s>, as of %s\n', 
                               srcgraph, 
                               CAST (NOW() AS VARCHAR)
                             ), 
                     -2
                   );
    env := vector (dict_new (16000), 0, '', '', '', 0, 0, 0, 0);
    ses := string_output ();
    FOR (SELECT * FROM ( SPARQL DEFINE input:storage "" 
                         SELECT ?s ?p ?o { GRAPH `iri(?:srcgraph)` { ?s ?p ?o } } 
                       ) AS sub OPTION (LOOP)) DO
      {
        http_ttl_triple (env, "s", "p", "o", ses);
        ses_len := length (ses);
        IF (ses_len > max_ses_len)
          {
            file_len := file_len + ses_len;
            IF (file_len > file_length_limit)
              {
                http (' .\n', ses);
                string_to_file (file_name, ses, -1);
                file_len := 0;
                file_idx := file_idx + 1;
                file_name := sprintf ('%s%06d.ttl', out_file, file_idx);
                string_to_file ( file_name, 
                                 sprintf ( '# Dump of graph <%s>, as of %s (part %d)\n', 
                                           srcgraph, 
                                           CAST (NOW() AS VARCHAR), 
                                           file_idx), 
                                 -2
                               );
                 env := VECTOR (dict_new (16000), 0, '', '', '', 0, 0, 0, 0);
              }
            ELSE
              string_to_file (file_name, ses, -1);
            ses := string_output ();
          }
      }
    IF (LENGTH (ses))
      {
        http (' .\n', ses);
        string_to_file (file_name, ses, -1);
      }
  }
;

Dump Multiple Graphs


CREATE PROCEDURE dump_graphs 
  ( IN  dir               VARCHAR  :=  'dumps'   , 
    IN  file_length_limit INTEGER  :=  1000000000
  )
  {
    DECLARE inx INT;
    inx := 1;
    SET ISOLATION = 'uncommitted';
    FOR ( SELECT * 
            FROM ( SPARQL DEFINE input:storage "" 
                   SELECT DISTINCT ?g { GRAPH ?g { ?s ?p ?o } . 
                                        FILTER ( ?g != virtrdf: ) 
                                      } 
                 ) AS sub OPTION ( LOOP )) DO
      {
        dump_one_graph ( "g", 
                         sprintf ('%s/graph%06d_', dir, inx), 
                         file_length_limit
                       );
        inx := inx + 1;
      }
  }
;

Load Graphs

This stored procedure performs a bulk load from a file.


CREATE PROCEDURE load_graphs 
  ( IN  dir  VARCHAR  := 'dumps/' )
  {
    DECLARE  arr  ANY;
    DECLARE  g    VARCHAR;

    arr := sys_dirlist (dir, 1);
    log_enable (2, 1);
    FOREACH (VARCHAR f IN arr) DO
      {
        IF (f LIKE '*.ttl')
          {
            DECLARE CONTINUE HANDLER FOR SQLSTATE '*'
              {
                log_message (sprintf ('Error in %s', f));
              };
            g := file_to_string (dir || '/' || f || '.graph');
            DB.DBA.TTLP_MT (file_open (dir || '/' || f), g, g, 255);      
          }
      }
    EXEC ('checkpoint');
  };

Example

  1. Call the dump_one_graph procedure, for ex. with the following values:

    SQL> dump_one_graph ('http://daas.openlinksw.com/data#', './data_', 1000000000); Done. -- 1438 msec.

  2. As a result, a dump of the graph will be found in the files data_XX (located in your Virtuoso db folder):

    data_000001.ttl data_000002.ttl .... data_000001.ttl.graph

Related