# The contents of this file are subject to the Mozilla Public
# License Version 1.1 (the "License"); you may not use this file
# except in compliance with the License. You may obtain a copy of
# the License at http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS
# IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
# implied. See the License for the specific language governing
# rights and limitations under the License.
#
# The Original Code is RDFGrabber version 1.0.
#
# The Initial Developer of the Original Code is European Environment
# Agency (EEA). Portions created by EEA are
# Copyright (C) European Environment Agency. All
# Rights Reserved.
#
# Contributor(s):
# Soren Roug, EEA
# Tomas Hjelmberg, CMG
#
from __future__ import nested_scopes
# Here we put the Zope class stuff
# Zope imports
from DateTime import *
import Globals
from Globals import Persistent, Acquisition
import OFS
import AccessControl
import binascii,md5
import rdfparser, objects
import pickle, types, os, string, re
from os.path import join, isfile
import const
from AccessControl import ClassSecurityInfo
from AccessControl.Permissions import view, manage_users
from urllib2 import HTTPError
# Directory holding the pickled triple caches of all RDFGrabber objects.
# CLIENT_HOME is not defined in this file -- presumably supplied by the
# Zope runtime; confirm before importing this module stand-alone.
_repos = os.path.join(CLIENT_HOME, 'RDFGrabber')

# Translation table mapping '/' to '-', used by RDFGrabber._get_new_ufn()
# so that a base64 digest can serve as a plain file name.
fixslash = string.maketrans('/', '-')
class SPO:
    # One RDF statement held as a (subject, predicate, object) triple.
    # subject()/predicate()/object() hand back the stored values unchanged;
    # the str*() variants return str() of the same values.

    # Allow (reluctantly) access to unprotected attributes
    __allow_access_to_unprotected_subobjects__ = 1

    def __init__(self, subject, predicate, object):
        self._subject, self._predicate, self._object = subject, predicate, object

    def subject(self):
        return self._subject

    def predicate(self):
        return self._predicate

    def object(self):
        return self._object

    def strsubject(self):
        return str(self._subject)

    def strpredicate(self):
        return str(self._predicate)

    def strobject(self):
        return str(self._object)

    def __str__(self):
        # Subject and predicate are always wrapped in angle brackets; the
        # object is double-quoted when its is_literal() is true and
        # angle-bracketed otherwise.
        if self._object.is_literal():
            template = '<%s> <%s> "%s"'
        else:
            template = '<%s> <%s> <%s>'
        return template % (str(self._subject),
                           str(self._predicate),
                           str(self._object))
class Individual ( objects.Resource ):
    # A resource bound to the model (an RDFGrabber) it was found in, so that
    # its properties can be read as attributes:
    #
    #   ind.<prefix>_<name>            first value of the property
    #   ind.list_<prefix>_<name>       all values, as a list
    #   ind.inv_<prefix>_<name>        first subject pointing at this resource
    #   ind.inv_list_<prefix>_<name>   all such subjects, as a list
    #
    # <prefix> must be a key of model.namespaces; the predicate URI is the
    # namespace value with <name> appended.
    security = ClassSecurityInfo()
    __allow_access_to_unprotected_subobjects__ = 1

    def __init__(self, resource, model):
        self.model = model
        self.uri = resource.uri
        self.anonymous = resource.anonymous
        # Not read anywhere in this file; kept in case templates rely on it.
        self.test = "test"

    def __cmp__(self, other):
        # Individuals compare (and hash, below) by URI only.
        return cmp(self.uri, other.uri)

    def __hash__(self):
        return hash(self.uri)

    def _predicate_key(self, property):
        """Translate '<prefix>_<name>' into the str() of its predicate resource.

        Returns None when <prefix> is not a key of model.namespaces.
        Raises AttributeError when `property` contains no underscore.
        """
        pos = property.find('_')
        if pos == -1:
            raise AttributeError("no attribute '%s'" % property)
        namespaces = self.model.namespaces
        prefix = property[:pos]
        if prefix not in namespaces:
            return None
        return str(objects.resource(namespaces[prefix] + property[pos + 1:]))

    security.declarePublic( 'has' )
    def has(self, property):
        # Return 1 when at least one statement about this resource uses the
        # predicate named by '<prefix>_<name>', otherwise 0 (also for an
        # unknown prefix).  Raises AttributeError when `property` has no
        # underscore.
        predicate = self._predicate_key(property)
        if predicate is None:
            return 0
        for statement in self.model.subjects().get(str(self), []):
            if str(statement.predicate()) == predicate:
                return 1
        return 0

    security.declarePublic( '__getattr__' )
    def __getattr__( self, property ):
        # Resolve the attribute forms listed at the top of the class.
        # Raises AttributeError when the name has no underscore, the prefix
        # is unknown, or no matching statement exists.
        inverse = property.startswith('inv_')
        if inverse:
            property = property[4:]
        as_list = property.startswith('list_')
        if as_list:
            property = property[5:]

        predicate = self._predicate_key(property)
        if predicate is None:
            raise AttributeError("no attribute '%s'" % property)

        # Choose the index by direction only.  Previously an inverse lookup
        # on a resource missing from the object index fell through to the
        # forward branch and returned forward-property values.
        if inverse:
            statements = self.model.objects().get(str(self), [])
        else:
            statements = self.model.subjects().get(str(self), [])

        matches = []
        for statement in statements:
            if str(statement.predicate()) != predicate:
                continue
            if inverse:
                value = self._wrap(statement.subject())
            else:
                value = self._wrap(statement.object())
            if not as_list:
                # Scalar form: the first match wins.
                return value
            matches.append(value)
        if not matches:
            raise AttributeError("no attribute '%s'" % property)
        return matches

    def _wrap(self, thing):
        """Prepare a statement end-point for return from __getattr__.

        Non-resources are returned as they are.  A resource typed as
        const.BAG, const.SEQ or const.ALT becomes a list of its wrapped
        members (every statement about it except the type statement).
        Any other resource becomes an Individual on the same model.
        """
        if not thing.is_resource():
            return thing
        statements = self.model.subjects().get(str(thing), [])
        type_key = str(const.TYPE)
        container_types = [str(const.BAG), str(const.SEQ), str(const.ALT)]
        is_container = 0
        for statement in statements:
            if (str(statement.predicate()) == type_key
                    and str(statement.object()) in container_types):
                is_container = 1
                break
        if not is_container:
            return Individual(thing, self.model)
        return [self._wrap(statement.object())
                for statement in statements
                if str(statement.predicate()) != type_key]

Globals.InitializeClass(Individual)
class RDFGrabber (
    Acquisition.Implicit,
    Persistent,
    AccessControl.Role.RoleManager,
    OFS.SimpleItem.Item):
    "Retrieve RDF from other websites."

    # The configuration (id, title, rdfurls, http_proxy, namespaces,
    # interval, _filename) is stored on the object.  The parsed statements
    # live in _v_* attributes that are pickled to a file below _repos by
    # update() and read back by _loadpickles():
    #
    #   _v_updatedate   DateTime of the last update(), or None
    #   _v_rdfsources   {url: parser.rdfsource}
    #   _v_triples      {url: [SPO, ...]}  (a plain list in older files)
    #   _v_subjects     {str(subject):   [SPO, ...]}
    #   _v_predicates   {str(predicate): [SPO, ...]}
    #   _v_objects      {str(object):    [SPO, ...]}
    #
    # NOTE: public methods that had no docstring are documented with
    # comments on purpose.  ZPublisher only publishes objects that have a
    # docstring, so adding one would make the method reachable by URL.

    __ac_permissions__=(
        ('View management screens', ('manage_main',)),
        ('View', ('', 'index_html', 'update','dumbdown',
                  'query', 'query_html', 'show_source','label_of',
                  'rdfsources',
                  )),
        ('Change RDFGrabbers', ('manage_edit',), ('Manager',)),
        )

    manage_options=(
        {'label':'Properties', 'action':'manage_main'},
        {'label':'Query', 'action':'query_html'},
        {'label':'Triples', 'action':'triples_html'},
        {'label':'Update', 'action':'update'},
        {'label':'Source', 'action':'show_source'},
        ) + OFS.SimpleItem.SimpleItem.manage_options

    meta_type = 'RDF Grabber'

    triples_html = Globals.DTMLFile("htmlview", globals())
    show_source = Globals.DTMLFile("source", globals())
    manage_main = Globals.DTMLFile("edit_prop", globals())
    query_html = Globals.DTMLFile("results", globals())
    index_html = Globals.DTMLFile("index_html", globals())

    ################################
    ##  Read access                #
    ################################

    def rdfsources(self,key):
        "Return the source recorded by update() for the URL `key`"
        # Raises KeyError when nothing is recorded for that URL.
        return self._v_rdfsources[key]

    def lastupdated(self):
        # DateTime of the last update(), or None if never updated/loaded.
        return self._v_updatedate

    def triples(self):
        # All statements as one flat list.  Older pickle files stored a
        # plain list, which is returned as it is.
        if isinstance(self._v_triples, list):
            return self._v_triples
        # Explicit accumulation instead of reduce(): reduce() without an
        # initial value raises TypeError on an empty dictionary.
        merged = []
        for statements in self._v_triples.values():
            merged.extend(statements)
        return merged

    def subjects(self):
        # Index {str(subject): [SPO, ...]}
        return self._v_subjects

    def predicates(self):
        # Index {str(predicate): [SPO, ...]}
        return self._v_predicates

    def objects(self):
        # Index {str(object): [SPO, ...]}
        return self._v_objects

    def filename(self):
        # Name of the pickle file, relative to _repos.
        return self._filename

    def lookup_subject(self,subject):
        # Statements whose subject is `subject`; [] when there are none.
        return self._v_subjects.get(str(subject), [])

    def lookup_predicate(self,predicate):
        # Statements whose predicate is `predicate`; [] when there are none.
        return self._v_predicates.get(str(predicate), [])

    def lookup_object(self,object):
        # Statements whose object is `object`; [] when there are none.
        return self._v_objects.get(str(object), [])

    def namespace_mapping(self):
        # The namespace table as text, one "prefix uri" pair per line --
        # the same shape __init__ and manage_edit accept.
        return '\n'.join([' '.join(pair) for pair in self.namespaces.items()])

    def _checkStatus(self):
        """Call update() when the cached data is older than `interval`.

        `interval` is divided by the number of seconds per day before it
        is added to the DateTime, so it appears to be given in seconds.
        """
        last = self._v_updatedate
        if last is None:
            # Never loaded: None + float would raise TypeError.
            self.update()
            return
        max_age_days = getattr(self, 'interval', 0) / (24.0 * 60 * 60)
        if last + max_age_days < DateTime():
            self.update()

    def individuals(self, rdfType=None):
        # Individuals for every subject of a const.TYPE statement, or only
        # those whose type equals `rdfType` when given.  The set removes
        # duplicates (Individual hashes and compares by uri).
        self._checkStatus()
        # .get(): the index has no const.TYPE entry when no such statement
        # has been loaded; indexing it directly raised KeyError.
        typed = self._v_predicates.get(str(const.TYPE), [])
        found = set()
        for statement in typed:
            if rdfType and str(statement.object()) != rdfType:
                continue
            found.add(Individual(statement.subject(), self))
        return list(found)

    def query(self,subject=None,predicate=None,object=None,onehit=None):
        """Search triples that match the query

        The first criterion given (subject, then predicate, then object)
        selects an index; the remaining criteria filter that result.  With
        no criteria every statement matches.  Returns a list, or with
        `onehit` the first match or None.
        """
        if subject:
            candidates = self._v_subjects.get(str(subject), [])
        elif predicate:
            candidates = self._v_predicates.get(str(predicate), [])
            predicate = None
        elif object:
            candidates = self._v_objects.get(str(object), [])
            object = None
        else:
            candidates = self.triples()

        # A subject criterion, if present, was consumed as the index key
        # above, so only predicate and object can remain as filters.
        res = []
        for spo in candidates:
            if predicate and str(predicate) != str(spo.predicate()):
                continue
            if object and str(object) != str(spo.object()):
                continue
            res.append(spo)

        if not onehit:
            return res
        if res:
            return res[0]
        return None

    def label_of(self,predicate):
        """Convenience to find the label for a predicate
           Assumes the RDF schema for that class has been loaded

           Returns `predicate` itself when no rdfs:label statement exists.
        """
        for item in self.lookup_subject(predicate):
            if item.strpredicate() == "http://www.w3.org/2000/01/rdf-schema#label":
                return item.strobject()
        return predicate

    def dumbdown(self,object):
        """
        Convenience method that starts at the statements whose object is
        `object`, follows the generated identifiers and returns the object
        strings of the rdf:value and rdf:_n statements it finds.
        """
        return self._dumbdown(self.lookup_object(str(object)), [])

    def _dumbdown(self,spo,list):
        """Recursive worker for dumbdown(); appends to and returns `list`.

        NOTE(review): the nesting of the else branch was reconstructed from
        a source without indentation -- confirm it pairs with the
        is_anonymous() test.
        """
        for statement in spo:
            if statement.object().is_anonymous() is None:
                name = statement.strpredicate()
                if name == "http://www.w3.org/1999/02/22-rdf-syntax-ns#value" \
                   or name[:44] == "http://www.w3.org/1999/02/22-rdf-syntax-ns#_":
                    list.append(statement.strobject())
            else:
                self._dumbdown(self.lookup_subject(statement.strobject()), list)
        return list

    ################################
    ##  State handling             #
    ################################

    def _loadpickles(self):
        """Read the six cached values from the pickle file.

        On any failure (missing file, truncated or unreadable pickle) all
        of them are reset to their empty defaults.

        NOTE: pickle.load runs code embedded in the file; the files below
        _repos must not be writable by untrusted users.
        """
        try:
            f = open(self.physicalpath(self._filename), 'r')
            try:
                updatedate = pickle.load(f)
                rdfsources = pickle.load(f)
                triples = pickle.load(f)
                subjects = pickle.load(f)
                predicates = pickle.load(f)
                objects = pickle.load(f)
            finally:
                f.close()
        except Exception:
            # Best effort by design: a missing or damaged cache only means
            # that the next update() has to fetch everything again.
            updatedate = None
            rdfsources = {}
            triples = {}
            subjects = {}
            predicates = {}
            objects = {}
        self._v_updatedate = updatedate
        self._v_rdfsources = rdfsources
        self._v_triples = triples
        self._v_subjects = subjects
        self._v_predicates = predicates
        self._v_objects = objects

    def _parse_namespaces(self, namespaces):
        """Turn a sequence of "prefix uri" lines into a {prefix: uri} dict.

        Blank lines are skipped; a line with a single token still raises
        IndexError, as before.
        """
        mapping = {}
        for line in namespaces:
            parts = string.split(line)
            if not parts:
                continue
            mapping[parts[0]] = parts[1]
        return mapping

    def __init__(self, id, title, rdfurls, http_proxy, namespaces, interval):
        """Store the configuration and start with an empty cache.

        `namespaces` is a sequence of "prefix uri" lines.
        """
        self.id = id
        self.title = title
        self.rdfurls = rdfurls
        self.namespaces = self._parse_namespaces(namespaces)
        self.http_proxy = http_proxy
        self.interval = interval
        self._filename = ''
        self._v_rdfsources = {}
        self._v_updatedate = None
        self._v_triples = {}
        self._v_subjects = {}
        self._v_predicates = {}
        self._v_objects = {}

    def __setstate__(self,state):
        """Restore the persistent state, migrate old attributes and reload
        the cached statements from the pickle file."""
        Persistent.__setstate__(self,state)
        if not hasattr(self, "_filename"): # backwards compatibility
            self._filename = self.id
        if not hasattr(self, "rdfurls"): # backwards compatibility
            self.rdfurls = [ self.rdfurl ]
            delattr(self, "rdfurl")
        self._loadpickles()

    def _ICanAddCache(self, triples, subject, predicate, object):
        """Parser callback: append the statement to the list `triples`."""
        # Assume spo is shared in all dictionaries
        triples.append(SPO(subject, predicate, object))

    def _ICanAdd(self, triples):
        """Enter every statement of `triples` into the three indexes."""
        for spo in triples:
            self._v_subjects.setdefault(str(spo.subject()), []).append(spo)
            self._v_predicates.setdefault(str(spo.predicate()), []).append(spo)
            self._v_objects.setdefault(str(spo.object()), []).append(spo)

    def manage_edit(self, title, rdfurls, http_proxy, namespaces, interval, REQUEST=None):
        "Edits the grabber's characteristics"
        self.title = title
        self.rdfurls = rdfurls
        self.http_proxy = http_proxy
        self.namespaces = self._parse_namespaces(namespaces)
        self.interval = interval
        return self.update(REQUEST)

    def update(self, REQUEST=None):
        "Call this function to get it to update its content"
        # Every configured URL is fetched again.  A URL that already has
        # cached statements is requested with the previous update time, so
        # that the parser can do a conditional fetch; when the fetch raises
        # IOError/HTTPError the cached statements and source are kept.
        # Afterwards the indexes are rebuilt and everything is pickled.
        # Returns a MessageDialog when REQUEST is given, otherwise None.

        # make the directories
        if not os.path.isdir(_repos):
            try:
                os.makedirs(_repos)
            except OSError:
                raise OSError('Can\'t create directory %s' % _repos)

        cachedate = self._v_updatedate
        self._v_updatedate = DateTime()

        if isinstance(self._v_triples, list):
            # Old pickle format without per-URL bookkeeping: nothing to reuse.
            cached_triples = {}
        else:
            cached_triples = self._v_triples
        cached_sources = self._v_rdfsources

        # Built from scratch so that URLs removed from rdfurls no longer
        # show up in triples() while being absent from the indexes.
        triples_by_url = {}
        sources_by_url = {}
        self._v_subjects = {}
        self._v_predicates = {}
        self._v_objects = {}

        for rdfurl in self.rdfurls:
            new_triples = []
            parser = rdfparser.RDFParser(
                lambda s, p, o: self._ICanAddCache(new_triples, s, p, o),
                http_proxy=self.http_proxy)
            try:
                # Only ask for "modified since" when there is a cached copy
                # to fall back on; otherwise a newly added URL would stay
                # empty until its document changes.
                if cachedate is not None and rdfurl in cached_triples:
                    parser.parse_url(rdfurl, cachedate.toZone('GMT').strftime('%a, %d %b %Y %H:%M:%S GMT'))
                else:
                    parser.parse_url(rdfurl)
                triples_by_url[rdfurl] = new_triples
                sources_by_url[rdfurl] = parser.rdfsource
            except (IOError, HTTPError): # assume not modified
                # The tuple is required: "except IOError, HTTPError" binds
                # the caught IOError to the name HTTPError.
                triples_by_url[rdfurl] = cached_triples.get(rdfurl, [])
                if rdfurl in cached_sources:
                    sources_by_url[rdfurl] = cached_sources[rdfurl]
            self._ICanAdd(triples_by_url[rdfurl])

        self._v_triples = triples_by_url
        self._v_rdfsources = sources_by_url

        fn = self.physicalpath(self._filename)
        try:
            os.rename(fn, fn+'.undo')
        except OSError:
            # No previous file to keep as backup.
            pass

        #write objects
        f = open(fn, 'w')
        try:
            pickle.dump(self._v_updatedate, f)
            pickle.dump(self._v_rdfsources, f)
            pickle.dump(self._v_triples, f)
            pickle.dump(self._v_subjects, f)
            pickle.dump(self._v_predicates, f)
            pickle.dump(self._v_objects, f)
        finally:
            f.close()

        if REQUEST is not None:
            return Globals.MessageDialog(
                title='Updated',
                message='Content of %s has been updated.\n' % self.id,
                action ='manage_main')

    ################################
    ##  External file handling     #
    ################################

    def _undo(self):
        """ Restore filename after undo or copy-paste

        NOTE(review): the reload was placed inside the rename branch when
        the indentation was reconstructed -- confirm.
        """
        if self._filename == '':
            return
        fn = self.physicalpath(self._filename)
        if not isfile(fn) and isfile(fn+'.undo'):
            os.rename(fn+'.undo', fn)
            self._loadpickles()

    def _copy(self, infile, outfile):
        """ read binary data from infile and write it to outfile
            infile and outfile may be strings, in which case a file with that
            name is opened, or filehandles, in which case they are accessed
            directly.

            Raises IOError naming this object's id and the file(s) involved.
            Files opened here are always closed again; an input stream is
            rewound after a successful copy when it supports seek().
        """
        if type(infile) is types.StringType:
            try:
                instream = open(infile, 'rb')
            except IOError:
                # The file may still sit under its '.undo' name.
                self._undo()
                try:
                    instream = open(infile, 'rb')
                except IOError:
                    raise IOError("%s (%s)" % (self.id, infile))
            close_in = 1
        else:
            instream = infile
            close_in = 0

        if type(outfile) is types.StringType:
            try:
                outstream = open(outfile, 'wb')
            except IOError:
                if close_in:
                    instream.close()
                raise IOError("%s (%s)" % (self.id, outfile))
            close_out = 1
        else:
            outstream = outfile
            close_out = 0

        try:
            try:
                blocksize = 2<<16
                block = instream.read(blocksize)
                outstream.write(block)
                # A short read marks the end of the input.
                while len(block)==blocksize:
                    block = instream.read(blocksize)
                    outstream.write(block)
            except IOError:
                # The original referenced an undefined name here, which
                # turned every copy error into a NameError.
                raise IOError("%s (%s -> %s)" % (self.id, infile, outfile))
            try:
                instream.seek(0)
            except Exception:
                # Best effort: not every stream can be rewound.
                pass
        finally:
            if close_in: instream.close()
            if close_out: outstream.close()

    def _get_new_ufn(self):
        """ Create a new unique filename, drop the last newline
            The base64 set of characters are listed in rfc1341. Unfortunately
            it includes the / character, and I must deal with that in UNIX systems.

            The name is the base64 form of the MD5 digest of
            self.absolute_url(1), with '/' replaced by '-'.
        """
        digest = md5.new(self.absolute_url(1)).digest()
        encoded = binascii.b2a_base64(digest)
        return string.translate(encoded, fixslash, '\r\n')

    def physicalpath(self, filename=''):
        """ Generate the full filename, including directories from
            _repos and self._filename

            `filename` may be a single name or a list/tuple of path
            components; '' (or an empty sequence) yields _repos itself.
        """
        if isinstance(filename, (list, tuple)):
            return join(_repos, *filename)
        if filename != '':
            return join(_repos, filename)
        return _repos

    ################################
    ## Special management methods  #
    ################################

    def manage_afterAdd(self, item, container, new_fn=None):
        """ This method is called, whenever _setObject in ObjectManager gets
            called. This is the case after a normal add and if the object is a
            result of cut-paste- or rename-operation.

            The object gets a new file name; an existing data file (or its
            '.undo' copy) is copied over to it before the cache is reloaded.
        """
        new_fn = new_fn or self._get_new_ufn()
        if self._filename != '':
            old_fn = self.physicalpath(self._filename)
            if isfile(old_fn):
                self._copy(old_fn, self.physicalpath(new_fn))
            elif isfile(old_fn+'.undo'):
                self._copy(old_fn+'.undo', self.physicalpath(new_fn))
        self._filename = new_fn
        self._loadpickles()
        return RDFGrabber.inheritedAttribute ("manage_afterAdd") \
                (self, item, container)

    def manage_beforeDelete(self, item, container):
        """ This method is called, when the object is deleted. To support
            undo-functionality and because this happens too, when the object
            is moved (cut-paste) or renamed, the external file is not deleted.
            It is just renamed to filename.undo and remains in the
            repository, until it is deleted manually.
        """
        fn = self.physicalpath(self._filename)
        # Remove an older backup first so that the rename below can succeed.
        try:
            os.unlink(fn+'.undo')
        except OSError:
            pass
        try:
            os.rename(fn, fn+'.undo')
        except OSError:
            pass
        return RDFGrabber.inheritedAttribute ("manage_beforeDelete") \
               (self, item, container)

    def manage_undo_transactions(self, transaction_info, REQUEST=None):
        """ This method is called, when the user has chosen an Undo-action.
            To support undo-functionality the external file is just renamed back from
            filename.undo to filename.
        """
        fn = self.physicalpath(self._filename)
        try:
            os.rename(fn+'.undo', fn)
            self._loadpickles()
        except OSError:
            # Nothing to restore.
            pass
        return RDFGrabber.inheritedAttribute ("manage_undo_transactions") \
               (self, transaction_info, REQUEST)

Globals.default__class_init__(RDFGrabber)
def manage_addRDFGrabber(self, id, title, rdfurls, http_proxy, namespaces, interval, REQUEST=None):
    """Create an RDFGrabber and install it in its parent Folder.

    'self' is bound to the parent Folder.  All other arguments are handed
    to the RDFGrabber constructor unchanged.  When REQUEST is given the
    folder's manage_main page is returned, otherwise None.
    """
    self._setObject(
        id,
        RDFGrabber(id, title, rdfurls, http_proxy, namespaces, interval))
    if REQUEST is None:
        return None
    return self.manage_main(self, REQUEST)
# DTML page loaded from the 'add_grabber' file next to this module;
# presumably the add form that posts to manage_addRDFGrabber -- confirm
# against the product's registration code.
manage_addRDFGrabberForm = Globals.DTMLFile('add_grabber', globals())