Splitting the 3.4 billion triple UniProt dataset file: A Guide
What?
How to split up the UniProt 3.4 billion triple dataset en route to bulk loading into a Virtuoso instance.
Why?
Attempting to load a single file of this magnitude is an inefficient and problem-prone undertaking for any RDF store. Hence the need to break up the file prior to bulk loading.
How?
The following script splits the 3.4 billion triple UniProt dataset file into smaller files.
The last line of the script tells which file should be split and where the result should go.
The splitter occupies only one CPU core due to its linear nature, so many files can be split in parallel on any multi-core box.
-- RDF/XML file splitter.
--
-- Parses one (optionally gzipped) RDF/XML file with rdf_load_rdfxml(), but
-- instead of storing triples it routes them to the callbacks below, which
-- buffer them and write them out as a numbered series of Turtle files.
--
-- All state shared between the callbacks lives in the app_env vector built by
-- DB.DBA.RDFXML_FILE_SPLIT_INIT; the slot numbers used throughout this script
-- are documented there.
--
-- NOTE: the triple buffer holds 50000 entries. That number appears in
-- RDFXML_FILE_SPLIT_INIT (make_array) and in both *_EV_TRIPLE* callbacks;
-- the three occurrences must be kept in sync.

-- Builds the initial splitter state.
--   out_fname_tmpl  sprintf() template for output file names; it receives the
--                   integer file index (e.g. '/path/part%06d.ttl').
--   cut_size        triple count at or above which the current output file is
--                   closed; checked only when the buffer is flushed.
--   app_env         receives the freshly built state vector.
create procedure DB.DBA.RDFXML_FILE_SPLIT_INIT (in out_fname_tmpl varchar, in cut_size integer, inout app_env any)
{
  app_env := vector (
    out_fname_tmpl,             -- [0] - template for output file names
    string_output (),           -- [1] - output session of the current file
    0,                          -- [2] - index of the current file
    iri_id_num (#ib1),          -- [3] - ID of the next bnode to allocate
    dict_new (1000000),         -- [4] - dictionary of bnodes (not read by the procedures in this script)
    make_array (50000, 'any'),  -- [5] - accumulator (buffer) of triples
    0,                          -- [6] - number of triples in the accumulator
    vector (dict_new (16000), 0, '', '', '', 0, 0, 0, 0), -- [7] - env of http_ttl_xxx()
    0,                          -- [8] - count of triples written to the current file
    cut_size );                 -- [9] - size of a single cut file, in triples
}
;

-- Serializes the buffered triples as Turtle into the current output session
-- and, when required, writes that session to disk and starts the next file.
--   app_env            splitter state (see DB.DBA.RDFXML_FILE_SPLIT_INIT).
--   can_continue_file  non-zero: write the file only once it has reached
--                      cut_size triples; zero: write it unconditionally
--                      (used for the final flush).
create procedure DB.DBA.RDFXML_FILE_SPLIT_FLUSH (inout app_env any, in can_continue_file integer)
{
  declare tctr, tcount, total_tcount, cut_size integer;
  declare triples, env, ses any;
  -- Progress trace.
  dbg_obj_princ ('DB.DBA.RDFXML_FILE_SPLIT_FLUSH (..., can_continue_file=', can_continue_file, '): file=', app_env[2], ' tcount=', app_env[6], ' total_count=', app_env[8]);
  -- Take the large members out of app_env (each slot is zeroed) and work on
  -- them as locals; they are put back with aset_zap_arg() at the end.
  ses := aref_set_0 (app_env, 1);
  triples := aref_set_0 (app_env, 5);
  tcount := app_env[6];
  env := aref_set_0 (app_env, 7);
  total_tcount := app_env[8];
  cut_size := app_env[9];
  -- Pass 1: namespace prefixes for every buffered triple.
  for (tctr := 0; tctr < tcount; tctr := tctr + 1)
    {
      http_ttl_prefixes (env, triples[tctr][0], triples[tctr][1], triples[tctr][2], ses);
    }
  -- Pass 2: the triples themselves.
  for (tctr := 0; tctr < tcount; tctr := tctr + 1)
    {
      http_ttl_triple (env, triples[tctr][0], triples[tctr][1], triples[tctr][2], ses);
    }
  app_env[6] := 0; -- the buffer is empty again
  total_tcount := total_tcount + tcount;
  if ((not can_continue_file) or (total_tcount >= cut_size))
    {
      -- Close the current file; mode -2 presumably overwrites an existing
      -- file (see the string_to_file documentation).
      string_to_file (sprintf (app_env[0], app_env[2]), ses, -2);
      -- Fresh session and fresh serializer env for the next file.
      ses := string_output ();
      env := vector (dict_new (16000), 0, '', '', '', 0, 0, 0, 0);
      app_env[2] := app_env[2] + 1;
      total_tcount := 0;
    }
  aset_zap_arg (app_env, 1, ses);
  aset_zap_arg (app_env, 5, triples);
  aset_zap_arg (app_env, 7, env);
  app_env[8] := total_tcount;
}
;

-- Parser callback: allocates a new blank node.
-- Returns the IRI_ID for the counter value in app_env[3] and advances the counter.
create procedure DB.DBA.RDFXML_FILE_SPLIT_EV_NEW_BLANK (inout g_iid IRI_ID, inout app_env any, inout res IRI_ID)
{
  declare i integer;
  i := app_env[3];
  res := iri_id_from_num (i);
  app_env[3] := i+1;
}
;

-- Parser callback: maps a URI to the value used for it in triples.
-- The URI string is returned unchanged.
create procedure DB.DBA.RDFXML_FILE_SPLIT_EV_GET_IID (inout uri varchar, inout g_iid IRI_ID, inout app_env any, inout res IRI_ID)
{
  res := uri;
}
;

-- Parser callback: receives a triple whose object is a reference.
-- Flushes the buffer first if it is full, then appends the triple.
create procedure DB.DBA.RDFXML_FILE_SPLIT_EV_TRIPLE (
  inout g_iid IRI_ID, inout s_uri varchar, inout p_uri varchar,
  inout o_uri varchar,
  inout app_env any )
{
  if (app_env[6] >= 50000) -- buffer size, see make_array() in RDFXML_FILE_SPLIT_INIT
    DB.DBA.RDFXML_FILE_SPLIT_FLUSH (app_env, 1);
  -- NOTE(review): box flag 1 presumably marks the string as an IRI so that it
  -- is not serialized as a literal -- confirm against the Virtuoso docs.
  __box_flags_set (o_uri, 1);
  app_env[5][app_env[6]] := vector (s_uri, p_uri, o_uri);
  app_env[6] := app_env[6]+1;
}
;

-- Parser callback: receives a triple whose object is a literal.
-- Flushes the buffer first if it is full, then appends the triple with the
-- object built from its value, datatype and language strings.
create procedure DB.DBA.RDFXML_FILE_SPLIT_EV_TRIPLE_L (
  inout g_iid IRI_ID, inout s_uri varchar, inout p_uri varchar,
  inout o_val any, inout o_type varchar, inout o_lang varchar,
  inout app_env any )
{
  if (app_env[6] >= 50000) -- buffer size, see make_array() in RDFXML_FILE_SPLIT_INIT
    DB.DBA.RDFXML_FILE_SPLIT_FLUSH (app_env, 1);
  app_env[5][app_env[6]] := vector (s_uri, p_uri,
    DB.DBA.RDF_MAKE_LONG_OF_TYPEDSQLVAL_STRINGS (o_val, o_type, o_lang));
  app_env[6] := app_env[6]+1;
}
;

-- Entry point: splits one RDF/XML file into a series of Turtle files.
--   in_fname        input file; names ending in '.rdf.gz' or '.xml.gz' are
--                   opened with gz_file_open(), all others with file_open().
--   base            passed to rdf_load_rdfxml() as its last argument.
--   parse_mode      passed to rdf_load_rdfxml() unchanged.
--   out_fname_tmpl  sprintf() template for output names; receives the file index.
--   cut_size        triple count at which an output file is closed. It is
--                   checked only on buffer flushes (every 50000 triples), so
--                   files may exceed it.
create procedure DB.DBA.RDFXML_FILE_SPLIT (in in_fname varchar, in base varchar, in parse_mode integer, in out_fname_tmpl varchar, in cut_size integer := 100000000)
{
  declare in_ses, app_env any;
  if (in_fname like '%.rdf.gz' or in_fname like '%.xml.gz')
    in_ses := gz_file_open (in_fname);
  else
    in_ses := file_open (in_fname);
  DB.DBA.RDFXML_FILE_SPLIT_INIT (out_fname_tmpl, cut_size, app_env);
  rdf_load_rdfxml (in_ses, parse_mode,
    '' /* fake graph, UNAME is to avoid copying */,
    vector (
      '',
      'DB.DBA.RDFXML_FILE_SPLIT_EV_NEW_BLANK',
      'DB.DBA.RDFXML_FILE_SPLIT_EV_GET_IID',
      'DB.DBA.RDFXML_FILE_SPLIT_EV_TRIPLE',
      'DB.DBA.RDFXML_FILE_SPLIT_EV_TRIPLE_L',
      '',
      '' ),
    app_env,
    base );
  -- Final flush: write out whatever is still buffered, regardless of cut_size.
  DB.DBA.RDFXML_FILE_SPLIT_FLUSH (app_env, 0);
}
;

-- Example run: names the file to split and where the result should go.
DB.DBA.RDFXML_FILE_SPLIT ('/demos/uniprot/src/uniparc.rdf', 'http://purl.uniprot.org/uniparc/', 0, '/demos/uniprot/src/uniparc%06d.ttl', 200000);