# The contents of this file are subject to the Mozilla Public
# License Version 1.1 (the "License"); you may not use this file
# except in compliance with the License. You may obtain a copy of
# the License at http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS
# IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
# implied. See the License for the specific language governing
# rights and limitations under the License.
#
# The Original Code is RDFGrabber version 1.0.
#
# The Initial Developer of the Original Code is European Environment
# Agency (EEA).  Portions created by EEA are
# Copyright (C) European Environment Agency.  All
# Rights Reserved.
#
# Contributor(s):
# Soren Roug, EEA
#
import operator
import string
import time

try:
    import urllib2  # Support for authenticated proxy
    ulib2 = 1
except ImportError:
    # Only a missing module should trigger the fallback; any other error
    # raised while importing urllib2 is a real problem and must surface.
    import urllib
    ulib2 = 0

import xmllib
import sys
from types import *
from objects import resource, literal
from const import *
from string import join, split
from urlparse import urljoin
from uuid import uuid1

# Separator between the namespace URI and the local name in the element and
# attribute names handled by this module.
ns_separator = " "

ABOUT_ATTRIBUTE = RDFNS + ns_separator + "about"
ID_ATTRIBUTE = RDFNS + ns_separator + "ID"
RESOURCE_ATTRIBUTE = RDFNS + ns_separator + "resource"
PARSETYPE_ATTRIBUTE = RDFNS + ns_separator + "parseType"

# Parser states: what kind of element the next unknown start tag denotes.
EXPECT_RESOURCE = 0
EXPECT_PROPERTY = 1


def _strip_separator(qname):
    """Return qname with every namespace separator removed.

    This turns a "<namespace> <local-name>" pair into one concatenated
    URI string.
    """
    return qname.replace(ns_separator, "")


class Context:
    """Snapshot of the parser's subject, language and state."""

    def __init__(self, subject, lang, state):
        self.subject = subject
        self.lang = lang
        self.state = state


class RDFParser(xmllib.XMLParser):
    """Parse an RDF file.

    Every statement found is reported by calling
    adder(subject, predicate, object).
    """

    def __init__(self, adder, http_proxy=None):
        """Set up the parser.

        adder      -- callable invoked as adder(subject, predicate, object)
        http_proxy -- optional proxy URL used for 'http' requests
        """
        self.adder = adder
        if http_proxy:
            self._proxies = {'http': http_proxy}
        else:
            self._proxies = {}
        self.state = None
        self.encoding = 'UTF-8'
        self.context = []
        self.subject = None
        self.predicate = []
        self.object = None
        self.li_count = 0
        self.anon_count = 0
        self.__data = []
        self.lang = ''
        # Defined up front so that feeding data without going through
        # parse_url() does not fail with AttributeError.
        self.baseurl = ''
        # Must be assigned before XMLParser.__init__ runs, exactly as in
        # the original ordering.
        self.elements = {
            # RDF
            RDFNS + ns_separator + 'RDF':
                (self.start_rdf, self.end_rdf),
            RDFNS + ns_separator + 'Description':
                (self.start_description, self.end_description),
            RDFNS + ns_separator + 'Seq':
                (self.start_container, self.end_container),
            RDFNS + ns_separator + 'Bag':
                (self.start_container, self.end_container),
            RDFNS + ns_separator + 'Alt':
                (self.start_container, self.end_container),
            RDFNS + ns_separator + 'li':
                (self.start_list, self.end_list),
        }
        xmllib.XMLParser.__init__(self)

    # ------------------------------------------------------------------
    # Context stack
    # ------------------------------------------------------------------

    def push(self):
        """Save the current subject, language and state on the stack."""
        self.context.append(Context(self.subject, self.lang, self.state))

    def pop(self):
        """Drop the top context and restore the one below it."""
        self.context.pop()
        previous = self.context[-1]
        self.lang = previous.lang
        self.state = previous.state
        self.subject = previous.subject

    def dupcontext(self):
        """Push the current top-of-stack context a second time."""
        self.context.append(self.context[-1])

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    def generate_uri(self):
        "Generate a unique id for an anonymous resource"
        self.anon_count = self.anon_count + 1
        return resource(uuid1().get_urn(), anonymous=1)

    def xmllang_attr(self, atts):
        "Check attributes for xml:lang and delete it"
        if "xml:lang" in atts:
            self.lang = atts["xml:lang"]
            del atts["xml:lang"]

    def about_id_attr(self, atts):
        """Set the current subject from rdf:about / rdf:ID and push it.

        rdf:about is resolved against the base URL, rdf:ID becomes a
        fragment of the base URL, and when neither is present an anonymous
        resource is generated.  The attribute that was used is removed
        from atts.
        """
        if ABOUT_ATTRIBUTE in atts:
            self.subject = resource(
                urljoin(self.baseurl, atts[ABOUT_ATTRIBUTE]))
            self.push()
            del atts[ABOUT_ATTRIBUTE]
        elif ID_ATTRIBUTE in atts:
            self.subject = resource(
                self.baseurl + "#" + atts[ID_ATTRIBUTE])
            self.push()
            del atts[ID_ATTRIBUTE]
        else:
            self.subject = self.generate_uri()
            self.push()

    def _qualify_tag(self, tag):
        """Prefix a tag that carries no namespace with the base URL.

        A '#' is inserted unless the base URL already ends in '/' or '#'.
        Start and end tags share this helper so that both sides of an
        element resolve to the same URI.
        """
        if ns_separator in tag:
            return tag
        if self.baseurl.endswith(('/', '#')):
            return self.baseurl + ns_separator + tag
        return self.baseurl + '#' + ns_separator + tag

    def _add_attribute_statements(self, subject, atts, skip):
        """Report each remaining attribute as a literal-valued statement.

        Attributes whose name is listed in skip are ignored.
        """
        for att in atts.keys():
            if att in skip:
                continue
            predicate = resource(_strip_separator(att))
            self.adder(subject, predicate, literal(atts[att], self.lang))

    # ------------------------------------------------------------------
    # xmllib callbacks
    # ------------------------------------------------------------------

    def handle_xml(self, encoding, standalone):
        """Remember the encoding given in the XML declaration, if any."""
        if encoding:
            self.encoding = encoding

    def unknown_starttag(self, tag, atts):
        """Dispatch a non-RDF start tag as a resource or a property."""
        tag = self._qualify_tag(tag)
        if self.state == EXPECT_RESOURCE:
            self.start_resource(tag, atts)
        else:
            self.start_property(tag, atts)

    def unknown_endtag(self, tag):
        """Dispatch a non-RDF end tag as a resource or a property."""
        # Qualified the same way as in unknown_starttag; previously the
        # '#' was omitted here, which gave end_property a different
        # predicate URI than the one seen at the start tag.
        tag = self._qualify_tag(tag)
        if self.state == EXPECT_PROPERTY:
            self.end_resource(tag)
        else:
            self.end_property(tag)

    def start_property(self, tag, atts):
        """Handle the start of a property element."""
        self.__data = []
        self.object = None
        self.state = EXPECT_RESOURCE
        self.xmllang_attr(atts)
        self.push()
        tag = _strip_separator(tag)
        if PARSETYPE_ATTRIBUTE in atts:
            if atts[PARSETYPE_ATTRIBUTE] == "Resource":
                del atts[PARSETYPE_ATTRIBUTE]
                new_subject = self.generate_uri()
                # The predicate is wrapped in resource() like in every
                # other adder() call of this class.
                self.adder(self.subject, resource(tag), new_subject)
                self.pop()
                self.subject = new_subject
                self.state = EXPECT_PROPERTY
                self.push()
            else:  # Only Literal is allowed
                del atts[PARSETYPE_ATTRIBUTE]
                self.setliteral()
        elif RESOURCE_ATTRIBUTE in atts:
            self.object = resource(
                urljoin(self.baseurl, atts[RESOURCE_ATTRIBUTE]))
            del atts[RESOURCE_ATTRIBUTE]
        # The remaining attributes are reported with the object resource
        # as their subject.  Without an object resource there is nothing
        # to attach them to, so no statement with a None subject is made.
        if self.object is not None:
            # ID is skipped just in case both about and ID are given.
            self._add_attribute_statements(
                self.object, atts, (ID_ATTRIBUTE,))

    def end_property(self, tag):
        """Report the statement for the property element that just ended."""
        if self.object is None:
            # No resource object was set: the collected character data is
            # the literal value.  Built before pop() so that the language
            # of the property element itself applies.
            self.object = literal("".join(self.__data), self.lang)
        self.pop()
        predicate = resource(_strip_separator(tag))
        self.adder(self.subject, predicate, self.object)
        self.object = None

    def start_resource(self, tag, atts):
        """Handle the start of a typed node element."""
        self.state = EXPECT_PROPERTY
        self.__data = []
        self.object = None
        self.anon_object = 0
        tag = _strip_separator(tag)
        self.xmllang_attr(atts)
        self.about_id_attr(atts)
        subject = self.context[-1].subject
        self.adder(subject, TYPE, resource(tag))
        # ID is skipped just in case both about and ID are given.
        self._add_attribute_statements(subject, atts, (ID_ATTRIBUTE,))

    def end_resource(self, tag):
        """Make the finished resource the current object and pop it."""
        self.object = self.subject
        self.pop()

    def ignore_tag(self, atts=None):
        """Do nothing."""
        pass

    def handle_data(self, text):
        """Collect character data."""
        self.__data.append(text)

    def handle_cdata(self, text):
        """Collect CDATA content."""
        self.__data.append(text)

    def handle_charref(self, ref):
        """Keep a character reference in its textual '&#...;' form."""
        self.handle_data('&#' + ref + ';')

    def unknown_entityref(self, ref):
        """Keep an unknown entity reference in its textual '&...;' form."""
        self.handle_data('&' + ref + ';')

    def syntax_error(self, message):
        """Ignore syntax errors reported by the XML parser."""
        pass

    def start_rdf(self, tag, atts):
        "Start of rdf info"
        self.xmllang_attr(atts)
        if 'xml:base' in atts:
            self.baseurl = atts['xml:base']
        self.state = EXPECT_RESOURCE
        self.subject = None
        self.push()

    def end_rdf(self, tag):
        """Nothing to do at the end of the rdf:RDF element."""
        pass

    def start_container(self, tag, atts):
        """Handle the start of rdf:Seq, rdf:Bag or rdf:Alt."""
        self.xmllang_attr(atts)
        self.about_id_attr(atts)
        self.li_count = 0

    def end_container(self, tag):
        """Report the container's type statement and pop its context."""
        tag = _strip_separator(tag)
        self.adder(self.subject, TYPE, resource(tag))
        self.pop()

    def start_description(self, tag, atts):
        """
            rdf:Descriptions are like classes,
            but don't generate a type statement.
        """
        self.__data = []
        self.object = None
        self.state = EXPECT_PROPERTY
        self.xmllang_attr(atts)
        self.about_id_attr(atts)
        self._add_attribute_statements(
            self.context[-1].subject, atts,
            (ABOUT_ATTRIBUTE, ID_ATTRIBUTE))

    def end_description(self, tag):
        """Make the finished description the current object and pop it."""
        self.object = self.subject
        self.pop()

    def start_list(self, tag, atts):
        """Handle the start of rdf:li: allocate the next rdf:_n predicate."""
        self.__data = []
        self.object = None
        self.li_count = self.li_count + 1
        self.predicate.append(resource(RDFNS + '_' + str(self.li_count)))
        if RESOURCE_ATTRIBUTE in atts:
            self.object = resource(
                urljoin(self.baseurl, atts[RESOURCE_ATTRIBUTE]))
            del atts[RESOURCE_ATTRIBUTE]
        # NOTE(review): the remaining attributes are reported with the
        # rdf:_n predicate resource as subject; kept as it was.
        self._add_attribute_statements(
            self.predicate[-1], atts, (ID_ATTRIBUTE,))

    def end_list(self, tag):
        """Report the membership statement for the rdf:li that just ended."""
        if self.object is None:
            self.object = literal("".join(self.__data), self.lang)
        self.adder(self.subject, self.predicate.pop(), self.object)
        self.object = self.subject

    def handle_starttag(self, tag, method, atts):
        """Call the registered start handler with the tag and attributes."""
        method(tag, atts)

    def handle_endtag(self, tag, method):
        """Call the registered end handler with the tag."""
        method(tag)

    # ------------------------------------------------------------------
    # Entry point
    # ------------------------------------------------------------------

    def parse_url(self, url, if_modified_since=None):
        """ Grab the file from the webserver and feed it to the parser

        url               -- location of the RDF document; also used as
                             the base URL for relative references
        if_modified_since -- optional value for the If-Modified-Since
                             request header

        Raises IOError when the document cannot be opened.
        """
        self.encoding = 'UTF-8'
        self.state = None
        self.context = []
        self.subject = None
        self.predicate = []
        self.object = None
        self.li_count = 0
        self.__data = []
        self.lang = ''
        self.baseurl = url

        if ulib2 == 1:
            proxy_support = urllib2.ProxyHandler(self._proxies)
            opener = urllib2.build_opener(
                proxy_support, urllib2.HTTPHandler, urllib2.FileHandler)
            urllib2.install_opener(opener)
            req = urllib2.Request(url=url)
            if if_modified_since:
                req.add_header('If-Modified-Since', if_modified_since)
            f = urllib2.urlopen(req)
        else:
            try:
                u = urllib.URLopener(proxies=self._proxies)
            except IOError:
                raise IOError("Unsupported protocol")
            u.addheader("User-agent", "RDFGrabber (helpdesk@eionet.eu.int)")
            # Only send the conditional header when a value was supplied;
            # the header used to be added even for None.
            if if_modified_since:
                u.addheader("If-Modified-Since", if_modified_since)
            f = u.open(url)

        if not f:
            raise IOError("Failure in open %s" % url)
        try:
            self.rdfsource = f.read()
            self.feed(self.rdfsource)
        finally:
            # Close the response even when reading or parsing fails.
            f.close()