This HTML5 document contains 32 embedded RDF statements represented using HTML+Microdata notation.

The embedded RDF content will be recognized by any processor of HTML5 Microdata.

Prefix Namespace IRI
dcterms http://purl.org/dc/terms/
n6 http://vos.openlinksw.com/wiki/main/VOS/VirtTipsAndTricksGuideSplittingUnProt/RdfXmlSplitter.
atom http://atomowl.org/ontologies/atomrdf#
foaf http://xmlns.com/foaf/0.1/
n13 http://vos.openlinksw.com/dataspace/services/wiki/
opl http://www.openlinksw.com/schema/attribution#
n2 http://vos.openlinksw.com/dataspace/owiki/wiki/VOS/
n21 http://docs.openlinksw.com/virtuoso/rdfsparql.
n19 http://vos.openlinksw.com/dataspace/owiki/wiki/VOS/VirtTipsAndTricksGuideSplittingUnProt/sioc.
dc http://purl.org/dc/elements/1.1/
n9 http://vos.openlinksw.com/dataspace/dav#
rdfs http://www.w3.org/2000/01/rdf-schema#
n12 http://rdfs.org/sioc/services#
n10 http://vos.openlinksw.com/dataspace/person/dav#
sioct http://rdfs.org/sioc/types#
n4 http://vos.openlinksw.com/dataspace/owiki/wiki/
rdf http://www.w3.org/1999/02/22-rdf-syntax-ns#
n15 http://vos.openlinksw.com/dataspace/owiki#
xsdh http://www.w3.org/2001/XMLSchema#
n8 http://vos.openlinksw.com/dataspace/%28NULL%29/wiki/VOS/
n17 http://vos.openlinksw.com/dataspace/person/owiki#
sioc http://rdfs.org/sioc/ns#
Subject Item
n10:this
foaf:made
n2:VirtTipsAndTricksGuideSplittingUnProt
Subject Item
n9:this
sioc:creator_of
n2:VirtTipsAndTricksGuideSplittingUnProt
Subject Item
n13:item
n12:services_of
n2:VirtTipsAndTricksGuideSplittingUnProt
Subject Item
n15:this
sioc:creator_of
n2:VirtTipsAndTricksGuideSplittingUnProt
Subject Item
n4:VOS
sioc:container_of
n2:VirtTipsAndTricksGuideSplittingUnProt
atom:entry
n2:VirtTipsAndTricksGuideSplittingUnProt
atom:contains
n2:VirtTipsAndTricksGuideSplittingUnProt
Subject Item
n2:VirtTipsAndTricksGuideSplittingUnProt
rdf:type
sioct:Comment atom:Entry
dcterms:created
2017-06-13T05:39:41.258265
dcterms:modified
2017-06-29T07:41:37.064129
rdfs:label
VirtTipsAndTricksGuideSplittingUnProt
foaf:maker
n10:this n17:this
dc:title
VirtTipsAndTricksGuideSplittingUnProt
opl:isDescribedUsing
n19:rdf
sioc:has_creator
n9:this n15:this
sioc:attachment
n6:sql
sioc:content
%META:TOPICPARENT{name="VirtTipsAndTricksGuide"}%
---+Splitting 3.4 billion triple UniProt datasets file Guide

---++What?
How to split up UniProt's 3.4 billion triples dataset en route to bulk loading into a Virtuoso instance.

---++Why?
Attempting to load a single file of this magnitude is an inefficient and problem-prone undertaking for any RDF store. Hence the need to break up the file prior to bulk loading.

---++How?
The <a href="%ATTACHURLPATH%/RdfXmlSplitter.sql" style="wikiautogen">following script</a> splits the 3.4 billion triple UniProt datasets file into smaller files. The last line of the script tells which file should be split and where the result should go. The splitter occupies only one CPU core due to its linear nature, so many files could be split in parallel on any multi-core box.

<verbatim>
-- Initializes app_env, the state vector that is passed to every parser
-- callback below. Slot numbers in the comments are the indexes used by the
-- other procedures.
create procedure DB.DBA.RDFXML_FILE_SPLIT_INIT (in out_fname_tmpl varchar, in cut_size integer, inout app_env any)
{
  app_env := vector (
    out_fname_tmpl,            -- [0] - template for out names
    string_output (),          -- [1] - out session
    0,                         -- [2] - index of current file
    iri_id_num (#ib1),         -- [3] - ID of next bnode to allocate
    dict_new (1000000),        -- [4] - dictionary of bnodes
    make_array (50000, 'any'), -- [5] - accumulator of triples
    0,                         -- [6] - number of triples in the accumulator
    vector (dict_new (16000), 0, '', '', '', 0, 0, 0, 0), -- [7] env of http_ttl_xxx()
    0,                         -- [8] - count of triples written to the current file
    cut_size );                -- [9] - size of single cut file
}
;

-- Serializes the accumulated triples into the current output session and,
-- when the file may not be continued or has reached cut_size triples,
-- writes the session to the next numbered file and starts a fresh one.
--   app_env           - state vector built by DB.DBA.RDFXML_FILE_SPLIT_INIT
--   can_continue_file - 0 forces the current file to be written out
create procedure DB.DBA.RDFXML_FILE_SPLIT_FLUSH (inout app_env any, in can_continue_file integer)
{
  declare tctr, tcount, total_tcount, cut_size integer;
  declare triples, env, ses any;
  dbg_obj_princ ('DB.DBA.RDFXML_FILE_SPLIT_FLUSH (..., can_continue_file=', can_continue_file, '): file=', app_env[2], ' tcount=', app_env[6], ' total_count=', app_env[8]);
  -- NOTE(review): aref_set_0 / aset_zap_arg look like a take-out / put-back
  -- pair that avoids copying the large slots -- confirm against Virtuoso docs.
  ses := aref_set_0 (app_env, 1);
  triples := aref_set_0 (app_env, 5);
  tcount := app_env[6];
  env := aref_set_0 (app_env, 7);
  total_tcount := app_env[8];
  cut_size := app_env[9];
  -- First pass: http_ttl_prefixes over every buffered triple.
  for (tctr := 0; tctr < tcount; tctr := tctr + 1)
    {
      http_ttl_prefixes (env, triples[tctr][0], triples[tctr][1], triples[tctr][2], ses);
    }
  -- Second pass: http_ttl_triple over the same triples.
  for (tctr := 0; tctr < tcount; tctr := tctr + 1)
    {
      http_ttl_triple (env, triples[tctr][0], triples[tctr][1], triples[tctr][2], ses);
    }
  app_env[6] := 0;
  total_tcount := total_tcount + tcount;
  if ((not can_continue_file) or (total_tcount >= cut_size))
    {
      -- File name is the template [0] formatted with the file index [2].
      string_to_file (sprintf (app_env[0], app_env[2]), ses, -2);
      ses := string_output ();
      env := vector (dict_new (16000), 0, '', '', '', 0, 0, 0, 0);
      app_env[2] := app_env[2] + 1;
      total_tcount := 0;
    }
  aset_zap_arg (app_env, 1, ses);
  aset_zap_arg (app_env, 5, triples);
  aset_zap_arg (app_env, 7, env);
  app_env[8] := total_tcount;
}
;

-- Parser callback: allocates a new blank node ID from the counter in slot [3].
create procedure DB.DBA.RDFXML_FILE_SPLIT_EV_NEW_BLANK (inout g_iid IRI_ID, inout app_env any, inout res IRI_ID)
{
  declare i integer;
  i := app_env[3];
  res := iri_id_from_num (i);
  app_env[3] := i+1;
}
;

-- Parser callback: returns the URI string itself instead of an IRI_ID.
create procedure DB.DBA.RDFXML_FILE_SPLIT_EV_GET_IID (inout uri varchar, inout g_iid IRI_ID, inout app_env any, inout res IRI_ID)
{
  res := uri;
}
;

-- Parser callback for a triple whose object is a URI: flushes the
-- accumulator when it is full (50000 entries, the size allocated in INIT),
-- then appends the triple.
create procedure DB.DBA.RDFXML_FILE_SPLIT_EV_TRIPLE (
  inout g_iid IRI_ID, inout s_uri varchar, inout p_uri varchar, inout o_uri varchar,
  inout app_env any )
{
  if (app_env[6] >= 50000)
    DB.DBA.RDFXML_FILE_SPLIT_FLUSH (app_env, 1);
  -- NOTE(review): presumably flags o_uri so it is serialized as an IRI
  -- rather than a string literal -- confirm.
  __box_flags_set (o_uri, 1);
  app_env[5][app_env[6]] := vector (s_uri, p_uri, o_uri);
  app_env[6] := app_env[6]+1;
}
;

-- Parser callback for a triple whose object is a literal: same buffering as
-- DB.DBA.RDFXML_FILE_SPLIT_EV_TRIPLE, with the object built from value,
-- datatype and language.
create procedure DB.DBA.RDFXML_FILE_SPLIT_EV_TRIPLE_L (
  inout g_iid IRI_ID, inout s_uri varchar, inout p_uri varchar,
  inout o_val any, inout o_type varchar, inout o_lang varchar,
  inout app_env any )
{
  if (app_env[6] >= 50000)
    DB.DBA.RDFXML_FILE_SPLIT_FLUSH (app_env, 1);
  app_env[5][app_env[6]] := vector (s_uri, p_uri, DB.DBA.RDF_MAKE_LONG_OF_TYPEDSQLVAL_STRINGS (o_val, o_type, o_lang));
  app_env[6] := app_env[6]+1;
}
;

-- Entry point: parses the RDF/XML file in_fname and writes it out as a
-- sequence of Turtle files.
--   in_fname       - input file; names ending in .rdf.gz or .xml.gz are opened as gzip
--   base           - base IRI passed to rdf_load_rdfxml
--   parse_mode     - parse mode passed to rdf_load_rdfxml
--   out_fname_tmpl - sprintf template for output names; receives the file index
--   cut_size       - triple count at which an output file is cut. The check is
--                    made only when the 50000-triple accumulator is flushed,
--                    so files are cut at flush boundaries.
create procedure DB.DBA.RDFXML_FILE_SPLIT (in in_fname varchar, in base varchar, in parse_mode integer, in out_fname_tmpl varchar, in cut_size integer := 100000000)
{
  declare in_ses, app_env any;
  if (in_fname like '%.rdf.gz' or in_fname like '%.xml.gz')
    in_ses := gz_file_open (in_fname);
  else
    in_ses := file_open (in_fname);
  DB.DBA.RDFXML_FILE_SPLIT_INIT (out_fname_tmpl, cut_size, app_env);
  rdf_load_rdfxml (in_ses, parse_mode,
    '' /* fake graph, UNAME is to avoid copying */,
    vector (
      '',
      'DB.DBA.RDFXML_FILE_SPLIT_EV_NEW_BLANK',
      'DB.DBA.RDFXML_FILE_SPLIT_EV_GET_IID',
      'DB.DBA.RDFXML_FILE_SPLIT_EV_TRIPLE',
      'DB.DBA.RDFXML_FILE_SPLIT_EV_TRIPLE_L',
      '',
      '' ),
    app_env,
    base );
  -- Final flush with can_continue_file = 0 writes whatever is still buffered.
  -- Qualified with DB.DBA. like every other reference to this procedure.
  DB.DBA.RDFXML_FILE_SPLIT_FLUSH (app_env, 0);
}
;

DB.DBA.RDFXML_FILE_SPLIT ('/demos/uniprot/src/uniparc.rdf', 'http://purl.uniprot.org/uniparc/', 0, '/demos/uniprot/src/uniparc%06d.ttl', 200000);
</verbatim>

---++Related
   * [[VirtTipsAndTricksGuide][Virtuoso Tips and Tricks Collection]]
   * [[http://docs.openlinksw.com/virtuoso/rdfsparql.html][Virtuoso Documentation]]
sioc:id
7505540fd1a7bd2b04553f8b2df53466
sioc:link
n2:VirtTipsAndTricksGuideSplittingUnProt
sioc:has_container
n4:VOS
n12:has_services
n13:item
atom:title
VirtTipsAndTricksGuideSplittingUnProt
sioc:links_to
n8:UniProt n21:html
atom:source
n4:VOS
atom:author
n10:this
atom:published
2017-06-13T05:39:41Z
atom:updated
2017-06-29T07:41:37Z
sioc:topic
n4:VOS