# The contents of this file are subject to the Mozilla Public # License Version 1.1 (the "License"); you may not use this file # except in compliance with the License. You may obtain a copy of # the License at http://www.mozilla.org/MPL/ # # Software distributed under the License is distributed on an "AS # IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or # implied. See the License for the specific language governing # rights and limitations under the License. # # The Original Code is RDFGrabber version 1.0. # # The Initial Developer of the Original Code is European Environment # Agency (EEA). Portions created by EEA are # Copyright (C) European Environment Agency. All # Rights Reserved. # # Contributor(s): # Soren Roug, EEA # Tomas Hjelmberg, CMG # from __future__ import nested_scopes # Here we put the Zope class stuff # Zope imports from DateTime import * import Globals from Globals import Persistent, Acquisition import OFS import AccessControl import binascii,md5 import rdfparser, objects import pickle, types, os, string, re from os.path import join, isfile import const from AccessControl import ClassSecurityInfo from AccessControl.Permissions import view, manage_users from urllib2 import HTTPError _repos = join(CLIENT_HOME, 'RDFGrabber') fixslash = string.maketrans('/','-') class SPO: # Allow (reluctantly) access to unprotected attributes __allow_access_to_unprotected_subobjects__=1 def __init__(self, subject,predicate,object): self._subject = subject self._predicate = predicate self._object = object def strsubject(self): return str(self._subject) def strpredicate(self): return str(self._predicate) def strobject(self): return str(self._object) def subject(self): return self._subject def predicate(self): return self._predicate def object(self): return self._object def __str__(self): if self._object.is_literal(): return '<' + str(self._subject) + '> <' + str(self._predicate) + '> "' + str(self._object) + '"' else: return '<' + str(self._subject) + '> <' + str(self._predicate) + '> <' + str(self._object) + '>' class Individual ( objects.Resource ): security = ClassSecurityInfo() __allow_access_to_unprotected_subobjects__=1 def __init__(self, resource, model): self.model = model self.uri = resource.uri self.anonymous = resource.anonymous self.test = "test" def __cmp__(self, other): return cmp(self.uri, other.uri) def __hash__(self): return hash(self.uri) security.declarePublic( 'has' ) def has(self, property): pos = string.find(property, '_') if (pos == -1): raise AttributeError, "no attribute '%s'" % property namespace = property[:pos] if self.model.namespaces.has_key(namespace): property_name = property[pos+1:] property_uri = self.model.namespaces[namespace] + property_name property_resource = objects.resource( property_uri ) if (self.model.subjects().has_key(str(self))): statements_about_self = self.model.subjects()[str(self)] for statement in statements_about_self: if str(statement.predicate()) == str(property_resource): return 1 return 0 security.declarePublic( '__getattr__' ) def __getattr__( self, property ): asList = 0 inverse = 0 if property.startswith('inv_'): inverse = 1 property = property[4:] if property.startswith('list_'): asList = 1 property = property[5:] pos = string.find(property, '_') if (pos == -1): raise AttributeError, "no attribute '%s'" % property namespace = property[:pos] if not self.model.namespaces.has_key(namespace): raise AttributeError, "no attribute '%s'" % property property_name = property[pos+1:] property_uri = self.model.namespaces[namespace] + property_name property_resource = objects.resource( property_uri ) ret = [] if inverse and (self.model.objects().has_key(str(self))): statements_about_self = self.model.objects()[str(self)] for statement in statements_about_self: if str(statement.predicate()) == str(property_resource): if asList == 0: return self._wrap(statement.subject()) else: ret.append(self._wrap(statement.subject())) elif (self.model.subjects().has_key(str(self))): statements_about_self = self.model.subjects()[str(self)] for statement in statements_about_self: if str(statement.predicate()) == str(property_resource): if asList == 0: return self._wrap(statement.object()) else: ret.append(self._wrap(statement.object())) if len(ret) == 0: raise AttributeError, "no attribute '%s'" % property return ret def _wrap(self, thing): if thing.is_resource(): # test for bag, seq or alt is_container = 0 if (self.model.subjects().has_key(str(thing))): statements_about_thing = self.model.subjects()[str(thing)] for statement in statements_about_thing: if (str(statement.predicate()) == str(const.TYPE)) and (str(statement.object()) in [str(const.BAG), str(const.SEQ), str(const.ALT)]): is_container = 1 break if (is_container == 1): statements_about_container = self.model.subjects()[str(thing)] return map(lambda x: self._wrap(x.object()), filter(lambda y: str(y.predicate()) != str(const.TYPE), statements_about_container)) else: return Individual(thing, self.model) else: return thing Globals.InitializeClass(Individual) class RDFGrabber ( Acquisition.Implicit, Persistent, AccessControl.Role.RoleManager, OFS.SimpleItem.Item): "Retrieve RDF from other websites." __ac_permissions__=( ('View management screens', ('manage_main',)), ('View', ('', 'index_html', 'update','dumbdown', 'query', 'query_html', 'show_source','label_of', 'rdfsources', )), ('Change RDFGrabbers', ('manage_edit',), ('Manager',)), ) manage_options=( {'label':'Properties', 'action':'manage_main'}, {'label':'Query', 'action':'query_html'}, {'label':'Triples', 'action':'triples_html'}, {'label':'Update', 'action':'update'}, {'label':'Source', 'action':'show_source'}, ) + OFS.SimpleItem.SimpleItem.manage_options meta_type = 'RDF Grabber' triples_html = Globals.DTMLFile("htmlview", globals()) show_source = Globals.DTMLFile("source", globals()) manage_main = Globals.DTMLFile("edit_prop", globals()) query_html = Globals.DTMLFile("results", globals()) index_html = Globals.DTMLFile("index_html", globals()) def rdfsources(self,key): "Returns a dictionary of sources where the key is the url" return self._v_rdfsources[key] def lastupdated(self): return self._v_updatedate def triples(self): if (type(self._v_triples) == type([])): return self._v_triples else: return reduce(lambda x, y: x + y, self._v_triples.values()) def subjects(self): return self._v_subjects def predicates(self): return self._v_predicates def objects(self): return self._v_objects def filename(self): return self._filename def lookup_subject(self,subject): if not self._v_subjects.has_key(str(subject)): return [] return self._v_subjects[str(subject)] def lookup_predicate(self,predicate): if not self._v_predicates.has_key(str(predicate)): return [] return self._v_predicates[str(predicate)] def lookup_object(self,object): if not self._v_objects.has_key(str(object)): return [] return self._v_objects[str(object)] def namespace_mapping(self): return string.join(map(lambda x: string.join(x, ' '), self.namespaces.items()), '\n') def _checkStatus(self): if self._v_updatedate + (getattr(self, 'interval', 0) / (24.0 * 60 * 60)) < DateTime(): self.update() def individuals(self, rdfType=None): self._checkStatus() result = set() if not rdfType: for ind in map(lambda x: Individual(x.subject(), self), self._v_predicates[str(const.TYPE)]): result.add(ind) else: for ind in map(lambda x: Individual(x.subject(), self), filter(lambda y: str(y.object()) == rdfType, self._v_predicates[str(const.TYPE)])): result.add(ind) return list(result) def query(self,subject=None,predicate=None,object=None,onehit=None): "Search triples that match the query" res = [] try: if subject: tmp = self._v_subjects[str(subject)] subject=None elif predicate: tmp = self._v_predicates[str(predicate)] predicate=None elif object: tmp = self._v_objects[str(object)] object=None else: if onehit: return self.triples()[0] else: return self.triples() except: tmp = [] for x in tmp: c = x if subject and str(subject) != str(x.subject()): c = None if c and predicate and str(predicate) != str(c.predicate()): c = None if c and object and str(object) != str(c.object()): c = None if c: res.append(c) # if self._v_subjects.has_key(str(subject)): # for x in self._v_subjects[str(subject)]: # if str(x.predicate()) == str(predicate): # res.append(x) if onehit: if len(res) > 0: return res[0] else: return None else: return res def label_of(self,predicate): """Convenience to find the label for a predicate Assumes the RDF schema for that class has been loaded """ for item in self.lookup_subject(predicate): if item.strpredicate() == "http://www.w3.org/2000/01/rdf-schema#label": return item.strobject() return predicate def dumbdown(self,object): """ This convenience method will find follow the pointers (the generated identifiers) for a subject,predicate pair and return a list of objects when it finds an rdf:Bag or an rdf:value """ list = [] return self._dumbdown(self.lookup_object(str(object)),list) def _dumbdown(self,spo,list): for i in spo: if i.object().is_anonymous() is None: if i.strpredicate() == "http://www.w3.org/1999/02/22-rdf-syntax-ns#value" \ or i.strpredicate()[:44] == "http://www.w3.org/1999/02/22-rdf-syntax-ns#_": list.append(i.strobject()) else: self._dumbdown(self.lookup_subject(i.strobject()),list) return list def _loadpickles(self): try: f = open(self.physicalpath(self._filename), 'r') self._v_updatedate = pickle.load(f) self._v_rdfsources = pickle.load(f) self._v_triples = pickle.load(f) self._v_subjects = pickle.load(f) self._v_predicates = pickle.load(f) self._v_objects = pickle.load(f) f.close() except: self._v_updatedate = None self._v_rdfsources = {} self._v_triples = {} self._v_subjects = {} self._v_predicates = {} self._v_objects = {} def __init__(self, id, title, rdfurls, http_proxy, namespaces, interval): self.id = id self.title = title self.rdfurls = rdfurls self.namespaces = {} for mapping in namespaces: tuple = string.split(mapping) self.namespaces[tuple[0]] = tuple[1] self.http_proxy = http_proxy self.interval = interval self._filename = '' self._v_rdfsources = {} self._v_updatedate = None self._v_triples = {} self._v_subjects = {} self._v_predicates = {} self._v_objects = {} def __setstate__(self,state): Persistent.__setstate__(self,state) if not hasattr(self, "_filename"): # backwards compatibility self._filename = self.id if not hasattr(self, "rdfurls"): # backwards compatibility self.rdfurls = [ self.rdfurl ] delattr(self, "rdfurl") self._loadpickles() def _ICanAddCache(self, triples, subject, predicate, object): # Assume spo is shared in all dictionaries spo = SPO(subject, predicate, object) triples.append(spo) def _ICanAdd(self, triples): for spo in triples: subject = spo.subject() predicate = spo.predicate() object = spo.object() x = str(subject) if not self._v_subjects.has_key(x): self._v_subjects[x] = [] self._v_subjects[x].append(spo) x = str(predicate) if not self._v_predicates.has_key(x): self._v_predicates[x] = [] self._v_predicates[x].append(spo) x = str(object) if not self._v_objects.has_key(x): self._v_objects[x] = [] self._v_objects[x].append(spo) def manage_edit(self, title, rdfurls, http_proxy, namespaces, interval, REQUEST=None): "Edits the grabber's characteristics" self.title = title self.rdfurls = rdfurls self.http_proxy = http_proxy self.namespaces = {} for mapping in namespaces: tuple = string.split(mapping) self.namespaces[tuple[0]] = tuple[1] self.interval = interval return self.update(REQUEST) def update(self, REQUEST=None): "Call this function to get it to update its content" # make the directories if not os.path.isdir(_repos): try: os.makedirs(_repos) except: raise OSError, 'Can\'t create directory %s' % _repos cachedate = self._v_updatedate self._v_updatedate = DateTime() if (type(self._v_triples) == type([])): self._v_triples = {} self._v_subjects = {} self._v_predicates = {} self._v_objects = {} self._v_rdfsources = {} for rdfurl in self.rdfurls: new_triples = [] p=rdfparser.RDFParser(lambda s, p, o: self._ICanAddCache(new_triples, s, p, o), http_proxy=self.http_proxy) try: if (cachedate != None): p.parse_url(rdfurl, cachedate.toZone('GMT').strftime('%a, %d %b %Y %H:%M:%S GMT')) else: p.parse_url(rdfurl) self._v_triples[rdfurl] = new_triples self._v_rdfsources[rdfurl] = p.rdfsource except IOError, HTTPError: # assume not modified if not self._v_triples.has_key(rdfurl): self._v_triples[rdfurl] = [] self._ICanAdd(self._v_triples[rdfurl]) fn = self.physicalpath(self._filename) try: os.rename(fn, fn+'.undo') except OSError: pass #write objects f = open(fn, 'w') pickle.dump(self._v_updatedate, f) pickle.dump(self._v_rdfsources, f) pickle.dump(self._v_triples, f) pickle.dump(self._v_subjects, f) pickle.dump(self._v_predicates, f) pickle.dump(self._v_objects, f) f.close() if REQUEST is not None: return Globals.MessageDialog( title='Updated', message='Content of %s has been updated.
' % self.id, action ='manage_main') def _undo(self): """ Restore filename after undo or copy-paste """ if self._filename == '': return fn = self.physicalpath(self._filename) if not isfile(fn) and isfile(fn+'.undo'): os.rename(fn+'.undo', fn) self._loadpickles() def _copy(self, infile, outfile): """ read binary data from infile and write it to outfile infile and outfile my be strings, in which case a file with that name is opened, or filehandles, in which case they are accessed directly. """ if type(infile) is types.StringType: try: instream = open(infile, 'rb') except IOError: self._undo() try: instream = open(infile, 'rb') except IOError: raise IOError, ("%s (%s)" %(self.id, infile)) close_in = 1 else: instream = infile close_in = 0 if type(outfile) is types.StringType: try: outstream = open(outfile, 'wb') except IOError: raise IOError, ("%s (%s)" %(self.id, outfile)) close_out = 1 else: outstream = outfile close_out = 0 try: blocksize = 2<<16 block = instream.read(blocksize) outstream.write(block) while len(block)==blocksize: block = instream.read(blocksize) outstream.write(block) except IOError: raise IOError, ("%s (%s)" %(self.id, filename)) try: instream.seek(0) except: pass if close_in: instream.close() if close_out: outstream.close() def _get_new_ufn(self): """ Create a new unique filename, drop the last newline The base64 set of characters are listed in rfc1341. Unfortunately it includes the / character, and I must deal with that in UNIX systems. """ return string.translate(binascii.b2a_base64(md5.new(self.absolute_url(1)).digest()), fixslash,'\r\n') def physicalpath(self, filename=''): """ Generate the full filename, including directories from _repos and self._filename """ path = _repos if type(filename)==types.ListType: for item in filename: path = join(path,item) elif filename != '': path = join(path,filename) return path ################################ ## Special management methods # ################################ def manage_afterAdd(self, item, container, new_fn=None): """ This method is called, whenever _setObject in ObjectManager gets called. This is the case after a normal add and if the object is a result of cut-paste- or rename-operation. """ new_fn = new_fn or self._get_new_ufn() if self._filename != '': old_fn = self.physicalpath(self._filename) if isfile(old_fn): self._copy(old_fn, self.physicalpath(new_fn)) else: if isfile(old_fn+'.undo'): self._copy(old_fn+'.undo', self.physicalpath(new_fn)) self._filename = new_fn self._loadpickles() return RDFGrabber.inheritedAttribute ("manage_afterAdd") \ (self, item, container) def manage_beforeDelete(self, item, container): """ This method is called, when the object is deleted. To support undo-functionality and because this happens too, when the object is moved (cut-paste) or renamed, the external file is not deleted. It is just renamed to filename.undo and remains in the repository, until it is deleted manually. """ fn = self.physicalpath(self._filename) try: os.unlink(fn+'.undo') except OSError: pass try: os.rename(fn, fn+'.undo') except OSError: pass return RDFGrabber.inheritedAttribute ("manage_beforeDelete") \ (self, item, container) def manage_undo_transactions(self, transaction_info, REQUEST=None): """ This method is called, when the user has chosen an Undo-action. To support undo-functionality the external file is just renamed back from filename.undo to filename. """ fn = self.physicalpath(self._filename) try: os.rename(fn+'.undo', fn) self._loadpickles() except OSError: pass return RDFGrabber.inheritedAttribute ("manage_undo_transactions") \ (self, transaction_info, REQUEST) Globals.default__class_init__(RDFGrabber) def manage_addRDFGrabber(self, id, title, rdfurls, http_proxy, namespaces, interval, REQUEST=None): """Create an object and install it in its parent Folder. The argument 'self' will be bound to the parent Folder. """ grabber = RDFGrabber(id, title, rdfurls, http_proxy, namespaces, interval ) self._setObject(id, grabber) if REQUEST is not None: return self.manage_main(self, REQUEST) manage_addRDFGrabberForm = Globals.DTMLFile('add_grabber', globals())