diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..aa21393 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,13 @@ +Copyright 2011 Ronan Klyne + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..c62a927 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,4 @@ +include config.ini +include LICENSE.txt +recursive-include graphite\Jena-2.6.4 *.txt *.jar +recursive-include graphite *.txt *.ini diff --git a/README.txt b/README.txt new file mode 100644 index 0000000..e2dd253 --- /dev/null +++ b/README.txt @@ -0,0 +1,19 @@ + + +Overview +======== + +Information on this project is currently being maintained at https://github.com/rklyne/python-graphite + +All comments gratefully received at python-graphite@rklyne.net + + +Dependencies +============ +This package depends on Python >=2.6 and JPype - http://sourceforge.net/projects/jpype/. 
+ +Installation command: +==================== + +python setup.py install + diff --git a/_cgi.py b/examples/_cgi.py old mode 100644 new mode 100755 similarity index 100% rename from _cgi.py rename to examples/_cgi.py diff --git a/examples/explorer.py b/examples/explorer.py new file mode 100755 index 0000000..4427394 --- /dev/null +++ b/examples/explorer.py @@ -0,0 +1,328 @@ + +import cgi +import cgitb +cgitb.enable() + +fs = cgi.FieldStorage() + +class Response(object): + def __call__(self, data, type="text/html"): + data = str(data) + import sys + w = sys.stdout.write + def wl(text): + w(text+"\n") + wl("Status: 200 OK") + wl("Content-Type: " + type) + wl('') + w(data) + sys.stdout.flush() + import time + time.sleep(1) + sys.exit() + +import rdfgraph + +def main(): + respond = Response() + import os + + path = os.environ.get('PATH_INFO', '') + if path.endswith('/schema'): + return schema_explorer(respond) + + if 'type' in fs: + show_data(respond) + else: + landing_page(respond) + +HTML = """ + + +Semantic web explorer + + + + + +%s + + +""" + +def landing_page(respond): + respond(HTML % ( + """

RDF data explorer

+

This aims to provide the most useful summary possible of any RDF data source.

+
+
+
Data protocol:
+
+ + + +
+
+
Data URL
+
+ +
+ + + """ + )) + + +def chart(data, caption="", threshold=0.95): + "Renders an HTML chart. Data maps name to value." + lst = [(c, p) for (p, c) in data.items()] + lst.sort() + lst.reverse() + total = sum([c for (c, p) in lst]) + if total == 0: + return caption+" No data!" + target = total * threshold + total = 0 + for i, (c, p) in enumerate(lst): + total += c + if total > target: + break + i += 1 + i = min(i, 8) + if i != len(lst): + extras = lst[i:] + extras_total = sum([c for (c, p) in extras]) + lst = lst[:i] + lst.append((extras_total, "Others")) + + + return '\n' \ + + '' \ + + '\n'.join(['' % (p, c) for (c, p) in lst]) \ + + '
'+caption+'
Count
%s%s
' + + +def show_data(respond): + g = rdfgraph.Graph() + data_type = fs['type'].value + url = fs['url'].value + + if url is None: + raise RuntimeError + if data_type == 'sparql': + g.add_endpoint(url) + elif data_type == 'http': + format = None + if url.endswith('.ttl'): + format = 'ttl' + g.load(url, format=format) + else: + return landing_page(respond) + + def quote(thing): + return cgi.escape(unicode(thing)) + + result = '' + g.describe("<%s>" % (url,)) + + resource_count = int(g.sparql(" SELECT ( COUNT ( DISTINCT ?x ) AS ?c ) WHERE { ?x ?y ?z } ").count('c')) + property_count = int(g.sparql("select (count(distinct ?y) as ?c) where {?x ?y ?z}").count('c')) + object_count = int(g.sparql("select (count(distinct ?z) as ?c) where {?x ?y ?z}").count('c')) + triple_count = int(g.sparql("select (count(?z) as ?c) where {?x ?y ?z}").count('c')) + type_count = int(g.sparql("select (count(distinct ?z) as ?c) where {?x a ?z}").count('c')) + typed_resource_count = int(g.sparql("select (count(distinct ?x) as ?c) where {?x a ?z}").count('c')) + + actions = [ + (0, 'triples'), + (type_count, 'type'), + (property_count, 'property'), + (object_count, 'object'), + (resource_count, 'resource'), + ] + actions.sort() + + for weight, action in actions: + if weight > 150: + result += "

Too many %ss to summarise

"%action + continue + else: + result += "

%s

"%action + + if action == 'triples': + result += '

' + chart({ + 'Untyped resources': resource_count-typed_resource_count, + 'Typed resources': typed_resource_count, + 'Properties': property_count, + 'Objects': object_count, + }, caption="Unique URI counts", threshold=2) + + explore_typed = False + if resource_count: + if typed_resource_count/resource_count < 0.1: + result += "Less than 10% of resources are typed. Maybe start looking there?
" + explore_typed = True + + prop_to_res = resource_count/property_count + if prop_to_res < 2: + result += "There are nearly as many properties as resources. This is a web.
" + if prop_to_res > 5: + result += "There are several properties on each resource. This is concentrated information.
" + + result += '''
+

''' + + elif action == 'property': + if True: + props = dict([ + (g.shrink_uri(d.get('y', '')), d['c']) + for d in + g.sparql("select ?y (count(?x) as ?c) where {?x ?y ?z} group by ?y order by desc(?c) limit 10") + if 'y' in d + ]) + else: + ps = g.sparql("select distinct ?y where {?x ?y ?z} limit 50")['y'] + props = {} + for p in ps: + resultlist = g.sparql("select (count(?x) as ?c) where {?x <%s> ?z}" % (p,)) + c = resultlist.count('c') + props[p.shrink_uri()] = c + if props: + result += chart(props, caption="Property frequencies") + + elif action == 'resource': + + rs = g.sparql("select distinct ?x where {?x ?y ?z} limit 150")['x'] + result += "

" + str(len(list(rs))) + ' - ' + ', '.join(map(quote, rs)) + '

' + + + elif action == 'type': + +### + # for d in g.sparql("select ?z (count(distinct ?x) as ?c) where {?x a ?z} group by ?z order by desc(?c) limit 10"): + # raise RuntimeError(d) +### + + if True: + types = dict([ + (d['z'], d['c']) + for d in + g.sparql("select ?z (count(distinct ?x) as ?c) where {?x a ?z} group by ?z order by desc(?c) limit 10") + if 'z' in d + ]) + else: + ts = g.sparql("select distinct ?y where {?x a ?y} limit 50")['y'] + types = {} + for t in ts: + resultlist = g.sparql("select (count(distinct ?x) as ?c) where {?x a <%s>}" % (t,)) + c = resultlist.count('c') + types[t.short_html()] = int(c) + if types: + result += "

Types (%s total)

\n" %type_count + result += chart(types, caption="Type frequencies") + + elif action == 'object': + result += "

Object summary not written

" + + else: + raise RuntimeError("unknown action", action) + + respond( + HTML % ( + "

%s

\n" % quote(url) + + result, + ) + ) + + + +def schema_explorer(respond): + if 'url' not in fs or 'json' not in fs: + #raise RuntimeError(fs.getvalue('url', None)) + respond(HTML %"""

RDF Schema explorer

+ (Reset) +
+

Give me an RDF Type URI and I'll do my best to visualise it.

+ + +
+
+ +
+

+ Built using python-graphite and JavaScript InfoViz Toolkit +
""") + else: + respond(schema_json(), type="application/json") + + +def schema_json(): + g = rdfgraph.Graph() + g.add_inference('schema') + url = fs['url'].value + prop = None + if 'property' in fs: + prop = fs['property'].value + + format = 'ttl' + if url.endswith('.ttl'): + format = 'ttl' + g.load(url, format=format) + r = g[url] + + if prop: + # defunct + data = { + 'subject': url, + 'property': prop, + 'values': [{ + 'id': n.expand_uri(), + 'name': n.shrink_uri(), + } for n in g[url].all(prop)], + } + + ns = r.get_ns() + + import json + + properties = [] + data = { + 'name': r['rdfs:label'], + 'properties': properties, + } + domain_of = list(r.all('-rdfs:domain')) + for p in domain_of: + for r in p.all('rdfs:range'): + properties.append({ + 'id': str(r), + 'name': "(" + p.shrink_uri() + ")" \ + + r.shrink_uri() if hasattr(r, 'shrink_uri') else r, + }) + return json.dumps(data) diff --git a/run.py b/examples/run.py old mode 100644 new mode 100755 similarity index 91% rename from run.py rename to examples/run.py index 2061328..0ef929d --- a/run.py +++ b/examples/run.py @@ -1,7 +1,9 @@ -import rdfgraph +#!/usr/bin/env python + +import graphite def main(): - e = rdfgraph.Graph() + e = graphite.Graph() uri = "http://webscience.org/person/2.n3" # uri = 'http://id.ecs.soton.ac.uk/person/1650' e.load(uri) @@ -19,7 +21,7 @@ def main(): print "People" uri = "http://webscience.org/people.n3" - g = rdfgraph.Graph().load(uri) + g = graphite.Graph().load(uri) names = [] for person in g.all_of_type('foaf:Person').sort('foaf:family_name'): print "-"*40 @@ -28,7 +30,7 @@ def main(): print ', '.join(map(str, names)) - print rdfgraph.Graph(). \ + print graphite.Graph(). \ load("http://webscience.org/people"). \ sparql("PREFIX foaf: SELECT * WHERE { ?person a foaf:Person } LIMIT 5") \ ['person']['foaf:name'].join(', ') \ @@ -42,7 +44,7 @@ def main2(): # Try playing with some Linked4 local govt. 
data # ( http://linked4.org/lsd/ ) # - graph = rdfgraph.Graph() + graph = graphite.Graph() graph.load_sparql( "http://linked4.org/lsd/sparql", """ @@ -63,7 +65,7 @@ def main3(): # Try playing with some Linked4 local govt. data # ( http://linked4.org/lsd/ ) # - data = rdfgraph.Dataset() + data = graphite.Dataset() data.add_endpoint("http://linked4.org/lsd/sparql") data.add_endpoint(dbpedia) # Royal Borough of Windsor and Maidenhead @@ -82,7 +84,7 @@ def main3(): def main4(): - graph = rdfgraph.Dataset() + graph = graphite.Dataset() graph.add_endpoint("http://services.data.gov.uk/reference/sparql") # graph.add_endpoint("http://linked4.org/lsd/sparql") @@ -99,7 +101,7 @@ def main4(): def explore_types(): - graph = rdfgraph.Graph() + graph = graphite.Graph() graph.add_endpoint("http://linked4.org/lsd/sparql") print graph.all_types().get('rdfs:label').join(', ') diff --git a/Jena-2.6.4/README.txt b/graphite/Jena-2.6.4/README.txt similarity index 100% rename from Jena-2.6.4/README.txt rename to graphite/Jena-2.6.4/README.txt diff --git a/Jena-2.6.4/ReleaseNotes.txt b/graphite/Jena-2.6.4/ReleaseNotes.txt similarity index 100% rename from Jena-2.6.4/ReleaseNotes.txt rename to graphite/Jena-2.6.4/ReleaseNotes.txt diff --git a/Jena-2.6.4/copyright.txt b/graphite/Jena-2.6.4/copyright.txt similarity index 100% rename from Jena-2.6.4/copyright.txt rename to graphite/Jena-2.6.4/copyright.txt diff --git a/graphite/Jena-2.6.4/lib/arq-2.8.7.jar b/graphite/Jena-2.6.4/lib/arq-2.8.7.jar new file mode 100644 index 0000000..3040a99 Binary files /dev/null and b/graphite/Jena-2.6.4/lib/arq-2.8.7.jar differ diff --git a/Jena-2.6.4/lib/arq-2.8.8.jar b/graphite/Jena-2.6.4/lib/arq-2.8.8.jar similarity index 100% rename from Jena-2.6.4/lib/arq-2.8.8.jar rename to graphite/Jena-2.6.4/lib/arq-2.8.8.jar diff --git a/Jena-2.6.4/lib/icu4j-3.4.4.jar b/graphite/Jena-2.6.4/lib/icu4j-3.4.4.jar similarity index 100% rename from Jena-2.6.4/lib/icu4j-3.4.4.jar rename to 
graphite/Jena-2.6.4/lib/icu4j-3.4.4.jar diff --git a/Jena-2.6.4/lib/iri-0.8.jar b/graphite/Jena-2.6.4/lib/iri-0.8.jar similarity index 100% rename from Jena-2.6.4/lib/iri-0.8.jar rename to graphite/Jena-2.6.4/lib/iri-0.8.jar diff --git a/Jena-2.6.4/lib/jena-2.6.4.jar b/graphite/Jena-2.6.4/lib/jena-2.6.4.jar similarity index 100% rename from Jena-2.6.4/lib/jena-2.6.4.jar rename to graphite/Jena-2.6.4/lib/jena-2.6.4.jar diff --git a/Jena-2.6.4/lib/log4j-1.2.13.jar b/graphite/Jena-2.6.4/lib/log4j-1.2.13.jar similarity index 100% rename from Jena-2.6.4/lib/log4j-1.2.13.jar rename to graphite/Jena-2.6.4/lib/log4j-1.2.13.jar diff --git a/Jena-2.6.4/lib/slf4j-api-1.5.8.jar b/graphite/Jena-2.6.4/lib/slf4j-api-1.5.8.jar similarity index 100% rename from Jena-2.6.4/lib/slf4j-api-1.5.8.jar rename to graphite/Jena-2.6.4/lib/slf4j-api-1.5.8.jar diff --git a/Jena-2.6.4/lib/slf4j-log4j12-1.5.8.jar b/graphite/Jena-2.6.4/lib/slf4j-log4j12-1.5.8.jar similarity index 100% rename from Jena-2.6.4/lib/slf4j-log4j12-1.5.8.jar rename to graphite/Jena-2.6.4/lib/slf4j-log4j12-1.5.8.jar diff --git a/Jena-2.6.4/lib/stax-api-1.0.1.jar b/graphite/Jena-2.6.4/lib/stax-api-1.0.1.jar similarity index 100% rename from Jena-2.6.4/lib/stax-api-1.0.1.jar rename to graphite/Jena-2.6.4/lib/stax-api-1.0.1.jar diff --git a/graphite/Jena-2.6.4/lib/wstx-asl-3.2.9.jar b/graphite/Jena-2.6.4/lib/wstx-asl-3.2.9.jar new file mode 100644 index 0000000..ffdbd1f Binary files /dev/null and b/graphite/Jena-2.6.4/lib/wstx-asl-3.2.9.jar differ diff --git a/Jena-2.6.4/lib/xercesImpl-2.7.1.jar b/graphite/Jena-2.6.4/lib/xercesImpl-2.7.1.jar similarity index 100% rename from Jena-2.6.4/lib/xercesImpl-2.7.1.jar rename to graphite/Jena-2.6.4/lib/xercesImpl-2.7.1.jar diff --git a/Jena-2.6.4/readme.html b/graphite/Jena-2.6.4/readme.html similarity index 100% rename from Jena-2.6.4/readme.html rename to graphite/Jena-2.6.4/readme.html diff --git a/graphite/LICENSE.txt b/graphite/LICENSE.txt new file mode 100644 index 
0000000..aa21393 --- /dev/null +++ b/graphite/LICENSE.txt @@ -0,0 +1,13 @@ +Copyright 2011 Ronan Klyne + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/graphite/__init__.py b/graphite/__init__.py new file mode 100644 index 0000000..f577b4c --- /dev/null +++ b/graphite/__init__.py @@ -0,0 +1,2 @@ + +from rdfgraph import Graph, Dataset, Endpoint, Config diff --git a/config.ini b/graphite/config.ini similarity index 100% rename from config.ini rename to graphite/config.ini diff --git a/rdfgraph.py b/graphite/rdfgraph.py similarity index 83% rename from rdfgraph.py rename to graphite/rdfgraph.py index 2a0598b..397c58b 100644 --- a/rdfgraph.py +++ b/graphite/rdfgraph.py @@ -2,6 +2,8 @@ Ripped off from Chris Gutteridge's Graphite: http://graphite.ecs.soton.ac.uk/ """ +from __future__ import print_function + # CONFIG! 
(finally) class Config(object): @@ -16,12 +18,16 @@ def __init__(self): def load(self): import os base_dir = os.path.dirname(__file__) + work_dir = os.getcwd() import ConfigParser cp = ConfigParser.SafeConfigParser(defaults={ 'jena_libs': 'jena/libs', 'jvm_lib': None, }) - cp.read(self.config_files) + cp.read(map( + lambda name: os.path.join(base_dir, name), + self.config_files, + )) libs_cfg = cp.get('config', 'jena_libs') if libs_cfg: @@ -46,7 +52,7 @@ def load(self): cache_dir = cp.get('config', 'cache_dir') except: cache_dir = self.cache_dir - self.cache_dir = os.path.join(base_dir, cache_dir) + self.cache_dir = os.path.join(work_dir, cache_dir) Config = Config() @@ -88,7 +94,7 @@ def parse_list(self, tpl): yield item elif isinstance(item, (str, unicode)): # Assume it's a URI. Maybe add some literal support later. - yield self[item] + yield self.resource(item) else: yield item @@ -130,7 +136,7 @@ def get_path(self, name): return self.index[name] def get(self, name): if name not in self.index: - raise KeyError, name + raise KeyError(name) with self.open(self.index[name], 'rb') as f: return f.read().decode('utf-8') __getitem__ = get @@ -219,17 +225,27 @@ def __init__(self, uri=None, namespaces=None, engine=None): if namespaces: self.add_ns(namespaces) + @classmethod + def get_default_engine_class(cls): + return getattr(Graph, '_default_graph_class', JenaGraph) + @classmethod + def use_jena(cls): + Graph._default_graph_class = JenaGraph + @classmethod + def use_rdflib(cls): + Graph._default_graph_class = RdflibGraph + def create_default_engine(self): - return JenaGraph() + return self.get_default_engine_class()() @takes_list def read_uri(self, lst, allow_error=False, _cache=[], **k): reload = k.get('reload', False) assert lst, "Load what?" 
for datum in lst: - assert getattr(datum, 'isURIResource', False), "Can't load " +`datum` + assert getattr(datum, 'isURIResource', False), "Can't load {0!r}".format(datum) try: - self._load_uri(datum.uri, reload=reload, format=k.get('format', None)) + self._load_uri(datum.uri(), reload=reload, format=k.get('format', None)) except: if not allow_error: raise @@ -247,6 +263,7 @@ def _sniff_format(self, data, type=None): return TURTLE elif type in [ 'application/rdf+xml', + 'text/xml', ]: return RDFXML elif type in [ @@ -276,15 +293,18 @@ def _load_uri(self, uri, **k): if 'format' in k: k['format'] = self._parse_rdf_format(k['format']) # Strip the fragment from this URI before caching it. + assert isinstance(uri, (str, unicode)), uri import urlparse uri_key = ''.join(urlparse.urlparse(uri)[:5]) if not reload and uri_key in self.loaded: return self.loaded[uri_key] = True + # I preferred turtle here, but RDFXML seems more robust with dodgy input data. + CACHE_FORMAT = RDFXML if uri in self.web_cache: try: - self.import_uri('file:///'+self.web_cache.get_path(uri), format=TURTLE) + self.import_uri('file:///'+self.web_cache.get_path(uri), format=CACHE_FORMAT) except: - print "Error getting <"+uri+"> from cache" + print("Error getting <"+uri+"> from cache") raise else: import urllib2 @@ -304,15 +324,36 @@ def _load_uri(self, uri, **k): raise RuntimeError("Got HTML data", uri, data, mime) data += f.read() data = data.decode(enc) - self.web_cache[uri] = data + self.engine.load_text(data, format) + # Then write the data to the cache. g = Graph() - g.import_uri('file:///'+self.web_cache.get_path(uri), format=format) - data = g.to_string(format=TURTLE) -# raise RuntimeError(data) - reloaded = g.engine.load_text(data, format=TURTLE) - assert reloaded, reloaded - self.web_cache[uri] = data + g._read_formatted_text(data, format) + data2 = g.to_string(format=CACHE_FORMAT) + # TODO: optimise this out: + # Prove that the data loads before writing it to disk. 
+ g.engine.load_text(data2, format=CACHE_FORMAT) + self.web_cache[uri] = data2 + + def file_uri(self, path): + import urllib + return 'file:'+urllib.pathname2url(path) + + def load_file(self, path, **k): + if 'format' not in k: + with open(path, 'rb') as f: + data = f.read(1024) + k['format'] = self._sniff_format(data) + else: + k['format'] = self._parse_rdf_format(k['format']) + uri = self.file_uri(path) + self.import_uri(uri, **k) + + def save_file(self, path, format='turtle'): + format = self._parse_rdf_format(format) + data = self.engine.to_string(format=format) + with open(path, 'wb') as f: + f.write(data) def import_uri(self, uri, **k): "Load data directly from a URI into the Jena model (uncached)" @@ -348,11 +389,13 @@ def _parse_rdf_format(self, format): def read_text(self, text, mime=None): format = self._sniff_format(text, type=mime) + return self._read_formatted_text(text, format) + def _read_formatted_text(self, text, format): if format == TURTLE: self.read_turtle(text) elif format == N3: self.read_n3(text) - elif format == NTRIPLES: + elif format == NTRIPLE: self.read_ntriples(text) elif format == RDFXML: self.read_rdfxml(text) @@ -540,7 +583,7 @@ def _parse_sparql_result(self, result_obj): def resource(self, uri): if getattr(uri, 'is_resource', False): return uri - return Resource(self, URINode(uri)) + return Resource(self, self._parse_uri(uri)) get = resource __getitem__ = resource def literal(self, thing): @@ -565,15 +608,15 @@ def add_inference(self, type): @takes_list def all_of_type(self, types): for type in types: - for x, y, z in self.triples(None, 'rdf:type', self[type]): + for x, y, z in self.triples(None, 'rdf:type', type): yield x @gives_list def all_types(self): seen = {} for x, y, z in self.triples(None, 'rdf:type', None): - if z in seen: continue - seen[z] = True + if z.value() in seen: continue + seen[z.value()] = True yield z @@ -615,6 +658,18 @@ def __iter__(self): yield x self.iterable = None + def __len__(self): + i = 0 + for _ in 
self: + i += 1 + return i + + def __repr__(self): + return "["+ ", ".join(map(repr, self)) +"]" + + def __str__(self): + return "["+ ", ".join(map(str, self)) +"]" + class ResourceList(Reiterable): isResourceList = True @@ -739,6 +794,22 @@ def value(self): # Literal return self.datum.value() + def label(self): + lbl = self.get( + "skos:prefLabel", + "rdfs:label", + "foaf:name", + "dct:title", + "dc:title", + "sioc:name", + ) + if lbl: + return str(lbl) + return lbl + + def has_label(self): + return bool(self.label()) + isURIResource = True def __hash__(self): @@ -771,10 +842,11 @@ def inverse_property_values(self): for x, y, z in self.graph.triples(None, None, res._get_raw_datum()): yield y, x - def get(self, prop): + def get(self, *props): "Get a property" - for x in self.all(prop): - return x + for prop in props: + for x in self.all(prop): + return x return None __getitem__ = get @@ -1078,8 +1150,8 @@ def triples(self, x, y, z): }) for uri in endpoints: if Config.sparql_debug: - print "Auto-query:", uri - print query + print("Auto-query: {0}".format(uri)) + print(query) self._triple_query_cache.setdefault(uri, {})[(x, y, z)] = True self.endpoint(uri).construct(self.data_cache, query) # @@ -1106,7 +1178,7 @@ def _sparql(self, query): def _load_all_sparql(self, query): for uri in self.select_endpoints(query): - raise NotImplementedError, "Implement Endpoint class for 'read_sparql'" + raise NotImplementedError("Implement Endpoint class for 'read_sparql'") for x in self.endpoint(uri).select(query): yield x @@ -1132,19 +1204,28 @@ class Engine(object): """Defines an interface for an RDF triple store and query engine. 
""" def sparql(self, query_text): - raise NotImplemented, "SPARQL querying not supported by this engine" + raise NotImplementedError("SPARQL querying not supported by this engine") def triples(self, subject, predicate, object): - raise NotImplemented, "Select triples from the store" + raise NotImplementedError("Select triples from the store") + + def load_uri(self, uri, format=TURTLE): + raise NotImplementedError("Load RDF from a URI into the store") - def load_uri(self, uri): - raise NotImplemented, "Load RDF from a URI into the store" + def load_text(self, text, format=TURTLE, encoding='utf-8'): + raise NotImplementedError("Load RDF from a string into the store") + + def dump(self, format=TURTLE): + return self.to_string(format=format) + + def to_string(self, format=TURTLE): + raise NotImplementedError("Dump RDF as a string") def expand_uri(self, uri): - raise NotImplementedError, "Expand a URI's shorthand prefix" + raise NotImplementedError("Expand a URI's shorthand prefix") def add_namespace(self, prefix, uri): - raise NotImplementedError, "Register a namespace and it's prefix" + raise NotImplementedError("Register a namespace and it's prefix") import warnings warnings.filterwarnings("ignore", message="the sets module is deprecated") @@ -1214,9 +1295,12 @@ class Node(object): is_blank = False is_uri = False is_literal = False - def __init__(self, datum): + def __init__(self, datum, **k): self.datum = datum + self.init(**k) assert self.check(), datum + def init(self): + pass def __str__(self): return unicode(self.datum) @@ -1243,11 +1327,14 @@ def value(self): def check(self): uri = self.datum assert isinstance(uri, (str, unicode)), (uri, type(uri)) + assert (type(uri) in (str, unicode)), (uri, type(uri)) return True class Literal(Node): is_literal = True def value(self): return self.datum + def init(self, datatype=None): + self.datatype = datatype class Blank(Node): is_blank = True def value(self): @@ -1260,7 +1347,7 @@ def __init__(self, debug=False): 
self.debug = debug else: def debug(x): - print x + print(x) self.debug = debug runJVM() @@ -1353,7 +1440,7 @@ def _mk_resource(self, res): JPackage(self._jena_pkg_name).rdf.model.Resource, ) assert getattr(res, 'is_node', False), (res, type(res)) - assert res.is_uri, res +# assert res.is_uri, res # XXX: TODO: This breaks with blank nodes, and shouldn't uri = res.datum assert isinstance(uri, (unicode, str)), (uri, type(uri)) return JObject( @@ -1393,6 +1480,8 @@ def _mk_object(self, obj): return obj.datum else: value = obj.value() + if isinstance(value, (str, unicode)): + value = JString(value) return JObject( self.get_model().createTypedLiteral(value), JPackage(self._jena_pkg_name).rdf.model.RDFNode, @@ -1430,13 +1519,13 @@ def load_uri(self, uri, format=None, allow_error=False): else: self.jena_model = jena - def load_text(self, text, format=TURTLE): + def load_text(self, text, format=TURTLE, encoding='utf-8'): format = self.get_jena_format(format) self.debug("JENA load text "+format) jena = self.get_model() uri = "tag:string-input" - if isinstance(text, unicode): - text = text.encode('utf-8') + if not isinstance(text, unicode): + text = unicode(text, encoding) jstr = JString(text) input = JClass('java.io.StringReader')(jstr) jena = jena.read(input, uri, format) @@ -1448,7 +1537,7 @@ def import_sparql(self, endpoint, query): qexec.execConstruct(self.jena_model) def has_triple(self, x, y, z): - self.debug(' '.join(["JENA has_triple ", `x`, `y`, `z`])) + self.debug(' '.join(["JENA has_triple ", repr(x), repr(y), repr(z)])) jena = self.get_model() sub = self._mk_resource(x) pred = self._mk_property(y) @@ -1456,7 +1545,7 @@ def has_triple(self, x, y, z): return bool(jena.contains(sub, pred, ob)) def set_triple(self, x, y, z): - self.debug(' '.join(["JENA add_triple ", `x`, `y`, `z`])) + self.debug(' '.join(["JENA add_triple ", repr(x), repr(y), repr(z)])) jena = self.get_model() sub = self._mk_resource(x) pred = self._mk_property(y) @@ -1469,7 +1558,7 @@ def 
set_triple(self, x, y, z): jena.add(stmt) def remove_triples(self, x, y, z): - self.debug(' '.join(["JENA remove_triples ", `x`, `y`, `z`])) + self.debug(' '.join(["JENA remove_triples ", repr(x), repr(y), repr(z)])) jena = self.get_model() sub = self._mk_resource(x) pred = self._mk_property(y) @@ -1477,7 +1566,7 @@ def remove_triples(self, x, y, z): jena.removeAll(sub, pred, ob) def triples(self, x, y, z): - self.debug(' '.join(["JENA triples ", `x`, `y`, `z`])) + self.debug(' '.join(["JENA triples ", repr(x), repr(y), repr(z)])) jena = self.get_model() sub = self._mk_resource(x) pred = self._mk_property(y) @@ -1539,3 +1628,120 @@ def sparql(self, query_text): # JenaGraph query = q_pkg.QueryFactory.create(query_text) qexec = q_pkg.QueryExecutionFactory.create(query, model) return self._iter_sparql_results(qexec) + + +class RdflibGraph(Engine, Jena): + """Defines a mechanism for accessing a triple store in rdflib. + """ + def __init__(self, **k): + super(RdflibGraph, self).__init__(**k) + import rdflib + import rdfextras + self.graph = rdflib.Graph() + + def sparql(self, query_text): + qres = self.graph.query(query_text) + qvars = qres.vars + #raise RuntimeError(qres.bindings, query_text) + for soln in qres.bindings: + d = {} + for v in qvars: + try: + value = soln[v] + except KeyError: + continue + parsed_value = self._convert_rdflib_value(value) + d[v.toPython()[1:]] = parsed_value + #raise RuntimeError(d, soln) + yield d + #raise NotImplementedError, "SPARQL querying not supported by this engine" + + def _convert_data_value(self, val): + if val is None: return None + import rdflib + if isinstance(val, URINode): + return rdflib.URIRef(val.value()) + if isinstance(val, Literal): + return rdflib.Literal(val.value()) + raise ValueError(val) + + def _convert_rdflib_value(self, val): + if val is None: + raise ValueError(val) + import rdflib + if isinstance(val, rdflib.URIRef): + return URINode(val.toPython()) + if isinstance(val, rdflib.BNode): + return 
Blank(str(val)) + if isinstance(val, rdflib.Literal): + datatype = val.datatype + if datatype is not None: + datatype = datatype.toPython() + return Literal(val.toPython(), datatype=datatype) + raise ValueError(val) + + def set_triple(self, subject, predicate, object): + self.graph.add(( + self._convert_data_value(subject), + self._convert_data_value(predicate), + self._convert_data_value(object), + )) + + def remove_triples(self, subject, predicate, object): + self.graph.remove(( + self._convert_data_value(subject), + self._convert_data_value(predicate), + self._convert_data_value(object), + )) + + def _triples(self, subject, predicate, object): + for s, p, o in self.graph.triples(( + self._convert_data_value(subject), + self._convert_data_value(predicate), + self._convert_data_value(object), + )): + yield ( + self._convert_rdflib_value(s), + self._convert_rdflib_value(p), + self._convert_rdflib_value(o), + ) + + def triples(self, subject, predicate, object): + return list(self._triples(subject, predicate, object)) + + def load_uri(self, uri, format=TURTLE): + return self.graph.parse(uri, format=self._convert_format_id(format)) + + def load_text(self, text, format=TURTLE, encoding='utf8'): + #u_text = text.decode(encoding) + u_text = text + return self.graph.parse(data=u_text, format=self._convert_format_id(format)) + + def _convert_format_id(self, format): + if format in (TURTLE, N3): + return 'n3' + if format in (NTRIPLE, ): + return 'n3' + if format in (RDFXML, ): + return 'xml' + raise ValueError("Unhandled RDF format descriptor", format) + + def expand_uri(self, uri): + if ':' not in uri: + return uri + prefix, rest = uri.split(':', 1) + for p, r in self.graph.namespaces(): + if prefix == p: + return r + rest + return uri + + def add_namespace(self, prefix, uri): + return self.graph.bind(prefix, uri, True) + + def to_string(self, format=TURTLE): + return self.graph.serialize(format=self._convert_format_id(format)) + + +# Hook in a more sensible default ;-) 
+Graph.use_rdflib() + diff --git a/setup.py b/setup.py new file mode 100755 index 0000000..3724619 --- /dev/null +++ b/setup.py @@ -0,0 +1,24 @@ + +from distutils.core import setup + +setup( + name='python-graphite', + version='0.2.2', + author='Ronan Klyne', + author_email='python-graphite@rklyne.net', + packages=['graphite'], + package_data={ + 'graphite': [ + 'config.ini', + '*.txt', + 'Jena-2.6.4/*.txt', + 'Jena-2.6.4/*.html', + 'Jena-2.6.4/lib/*.jar', + ], + }, + scripts=[], + url='http://code.google.com/p/python-graphite/', + license='LICENSE.txt', + description='A flexible RDF hacking library built on JPype and Jena', + long_description=open('README.txt').read(), +) diff --git a/test.py b/test/test.py old mode 100644 new mode 100755 similarity index 62% rename from test.py rename to test/test.py index 378b22b..0505ce7 --- a/test.py +++ b/test/test.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python """ These test may seem a bit light. If they don't then they should. @@ -5,15 +6,23 @@ """ import unittest -import rdfgraph class Test(unittest.TestCase): verbose = False + + def new_graph(self, g=None): + if g is None: + g = rdfgraph.Graph() + self.g = g + def setUp(self): - self.g = rdfgraph.Graph() + self.new_graph() def tearDown(self): self.g = None + def file_data(self, data): + return TempFile(data) + SAMPLE_RDFXML = """ @@ -39,6 +48,26 @@ def tearDown(self): """ +class TempFile(object): + def __init__(self, data): + assert isinstance(data, str) # Only permit bytes here. 
+ self.data = data + + def __enter__(self): + import tempfile + tpl = tempfile.mkstemp() + fn, self.name = tpl + tf = open(self.name, 'wb') + tf.write(self.data) + tf.close() + return self.name + + def __exit__(self, a,b,c): + try: + import os + os.remove(self.name) + except: pass + class TestRead(Test): def test_read_XML(self): self.g.load_rdfxml(SAMPLE_RDFXML) @@ -105,6 +134,14 @@ def test_set(self, other=None): self.failUnless(r['tag:p']) self.assertEquals(r['tag:p'], other) + def test_set_char(self): + # Check single characters + r = self.g.get('tag:dummy1') + char = 'A' + r['tag:char'] = char + self.failUnless(r['tag:char']) + self.assertEquals(r['tag:char'], char) + def test_set_literal(self): self.test_set(other=2) self.test_set(other="Wibble") @@ -212,9 +249,103 @@ def test_join(self): lst1 ) +class TestUnicode(Test): + + u_lit = u'\x9c' # What iso-8859-1 calls '\xa3' - the British Pound sign. + u_ttl = ''' + @prefix xsd: . + "\xc2\x9c"^^xsd:string . + ''' + _rel = 'tag:new_relation' + _res = 'tag:new_resource' + + def assert_loaded(self, g=None): + if g is None: + g = self.g + ts = list(g.triples(None, None, None)) + self.assertEquals(len(ts), 1) + self.assertEquals(self.u_lit, g[self._res][self._rel]) + + def assert_not_loaded(self, g=None): + if g is None: + g = self.g + ts = list(g.triples(None, None, None)) + self.assertEquals(len(ts), 0) + + def test_ttl_load(self): + self.g.load_turtle(self.u_ttl) + self.assert_loaded() + + def test_ttl_load_file(self, use_cache=False): + import os + self.assert_not_loaded() + with self.file_data(self.u_ttl) as f: + self.failUnless(os.path.isfile(f), f) + with open(f, 'rb') as fp: + self.failUnless(fp, f) + if use_cache: + uri = self.g.file_uri(f) + self.g.load(uri) + else: + self.g.load_file(f) + self.assert_loaded() + + def test_ttl_load_file_with_cache(self): + self.test_ttl_load_file(True) + + def test_set_literal(self): + r = self.g[self._res] + r.set(self._rel, self.u_lit) + self.assertEquals(self.u_lit, 
self.g[self._res][self._rel]) + + def test_save_and_load(self): + import tempfile + fno, name = tempfile.mkstemp() + self.g.load_turtle(self.u_ttl) + self.assert_loaded() + self.g.save_file(name) + + # The test of save is whether we can load it or not. + self.new_graph() + self.assert_not_loaded() + self.g.load_file(name) + self.assert_loaded() + + +class TestSparql(Test): + def setUp(self): + super(TestSparql, self).setUp() + self.g.load_ttl(""" + + a . + """) + + def test_select(self): + results = self.g.sparql("select ?s ?p ?o where {?s ?p ?o}") + self.failUnless(results) + for var, expected in [ + ('s', 'tag:dummy1'), + ('p', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'), + ('o', 'tag:dummy2'), + ]: + lst = list(results[var]) + self.assertEquals(len(lst), 1) + self.assertEquals(lst[0], expected) if __name__ == '__main__': + # A bit of bootstrap to make sure we test the right stuff import sys + import os + mod_path = os.path.join(os.path.dirname(__file__), os.pardir) + mod_path = os.path.abspath(mod_path) + + print "Testing in", mod_path + sys.path.insert(0, mod_path) + + import graphite.rdfgraph as rdfgraph + globals()['rdfgraph'] = rdfgraph + + # Kick off the tests unittest.main(argv=sys.argv)