[GNUnet-SVN] r16745 - Extractor-python

gnunet-svn
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[GNUnet-SVN] r16745 - Extractor-python

From:	gnunet
Subject:	[GNUnet-SVN] r16745 - Extractor-python
Date:	Fri, 9 Sep 2011 21:13:37 +0200
Author: grothoff
Date: 2011-09-09 21:13:37 +0200 (Fri, 09 Sep 2011)
New Revision: 16745

Modified:
   Extractor-python/ChangeLog
   Extractor-python/README
   Extractor-python/extract.py
   Extractor-python/extractor.py
   Extractor-python/setup.py
Log:
updating binding to work with LE 0.6

Modified: Extractor-python/ChangeLog
===================================================================
--- Extractor-python/ChangeLog  2011-09-09 18:13:19 UTC (rev 16744)
+++ Extractor-python/ChangeLog  2011-09-09 19:13:37 UTC (rev 16745)
@@ -1,8 +1,9 @@
+Fri Sep  9 21:09:47 CEST 2011
+       Updated Python binding to GNU libextractor 0.6-API. -CG
+
 Sun May        28 00:42:00 CEST 2006
        Rewritten using ctypes.
-       
+
 Mon Jul  4 17:19:33 CEST 2005
        Moved python binding into separate package.
 
-       
-       

Modified: Extractor-python/README
===================================================================
--- Extractor-python/README     2011-09-09 18:13:19 UTC (rev 16744)
+++ Extractor-python/README     2011-09-09 19:13:37 UTC (rev 16745)
@@ -18,8 +18,8 @@
  * python >= 2.3
     web site: http://www.python.org/
 
- * libextractor > 0.5
-    web site: http://gnunet.org/libextractor
+ * libextractor >= 0.6
+    web site: http://www.gnu.org/software/libextractor/
 
  * ctypes >= 0.9 
     web site: http://starship.python.net/crew/theller/ctypes/
@@ -62,6 +62,7 @@
 =========
  
  Copyright (C) 2006 Bader Ladjemi <address@hidden>
+ Copyright (C) 2011 Christian Grothoff <address@hidden>
 
  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by

Modified: Extractor-python/extract.py
===================================================================
--- Extractor-python/extract.py 2011-09-09 18:13:19 UTC (rev 16744)
+++ Extractor-python/extract.py 2011-09-09 19:13:37 UTC (rev 16745)
@@ -24,10 +24,22 @@
 """
 import extractor
 import sys
+from ctypes import *
+import struct
 
 xtract = extractor.Extractor()
+
+def print_k(xt, plugin, type, format, mime, data, datalen):
+    mstr = cast (data, c_char_p)
+# FIXME: this ignores 'datalen', not that great...
+# (in general, depending on the mime type and format, only
+# the first 'datalen' bytes in 'data' should be used).
+    if (format == extractor.EXTRACTOR_METAFORMAT_UTF8):
+        print "%s - %s" % (xtract.keywordTypes()[type],  mstr.value)
+    return 0
+
+
 for arg in sys.argv[1:]:
     print "Keywords from %s:" % arg
-    keys = xtract.extract(arg)
-    for keyword_type, keyword in keys:
-        print "%s - %s" % (keyword_type.encode('iso-8859-1'), 
keyword.encode('iso-8859-1'))
+    xtract.extract(print_k, None, arg)
+

Modified: Extractor-python/extractor.py
===================================================================
--- Extractor-python/extractor.py       2011-09-09 18:13:19 UTC (rev 16744)
+++ Extractor-python/extractor.py       2011-09-09 19:13:37 UTC (rev 16745)
@@ -2,6 +2,7 @@
 ## Python bindings for GNU libextractor
 ## 
 ## Copyright (C) 2006 Bader Ladjemi <address@hidden>
+## Copyright (C) 2011 Christian Grothoff <address@hidden>
 ##
 ## This program is free software; you can redistribute it and/or modify
 ## it under the terms of the GNU General Public License as published by
@@ -34,57 +35,35 @@
 #fake cdll import
 try:
     #loading shared object file
-    libextractor = cdll.LoadLibrary('libextractor.so.1')
+    libextractor = cdll.LoadLibrary('libextractor.so.3')
 except OSError:
     libextractor = cdll.extractor
  
-__all__ = ['Extractor', 'isBinaryType', 'EXTRACTOR_ENCODING', 
'DEFAULT_LIBRARIES', 'EXTRACTOR_THUMBNAIL_DATA']
-__version__ = "0.5"
+__all__ = ['Extractor']
+__version__ = "0.6"
 __licence__ = "GNU GPL"
 
 """
 keyword's charset encoding
 """
-EXTRACTOR_ENCODING = "utf-8"
-
 KeywordType = c_int
-Keywords_p = POINTER('Keywords')
-class Keywords(Structure):
-    """
-    EXTRACTOR_Keywords struct
-    """
-    _fields_ = [('keyword', c_char_p),
-               ('keywordType', KeywordType),
-               ('next', Keywords_p)]   
-SetPointerType(Keywords_p, Keywords)
+MetaType = c_int
 
-KEYWORDS = POINTER(Keywords)
+EXTRACT_CB = CFUNCTYPE(c_int, c_void_p, c_char_p, KeywordType, MetaType, 
c_char_p, c_void_p, c_size_t)
 
-libextractor.EXTRACTOR_getKeywords.restype = KEYWORDS
-libextractor.EXTRACTOR_getKeywords2.restype = KEYWORDS
-libextractor.EXTRACTOR_removeDuplicateKeywords.restype = KEYWORDS
-libextractor.EXTRACTOR_getKeywordTypeAsString.restype = c_char_p
+libextractor.EXTRACTOR_metatype_get_max.restype = KeywordType
+libextractor.EXTRACTOR_metatype_to_description.restype = c_char_p
+libextractor.EXTRACTOR_metatype_to_string.restype = c_char_p
+libextractor.EXTRACTOR_plugin_add_defaults.restype = c_void_p
+libextractor.EXTRACTOR_extract.argtypes = [c_void_p, c_char_p, c_void_p, 
c_size_t, EXTRACT_CB, c_void_p]
 
-libextractor.EXTRACTOR_getDefaultLibraries.restype = c_char_p
 
-"""
-thumbnail keyword type (binary)
-"""
-EXTRACTOR_THUMBNAIL_DATA = 70
+EXTRACTOR_METAFORMAT_UNKNOWN = 0
+EXTRACTOR_METAFORMAT_UTF8 = 1
+EXTRACTOR_METAFORMAT_BINARY = 2
+EXTRACTOR_METAFORMAT_C_STRING = 3
 
-def isBinaryType(keyword_type):
-    """
-    returns if the given keyword_type is binary
 
-    @param keyword_type: keyword type (int)
-    """
-    return keyword_type == EXTRACTOR_THUMBNAIL_DATA
-
-"""
-default loaded libraries
-"""
-DEFAULT_LIBRARIES = libextractor.EXTRACTOR_getDefaultLibraries().split(':')
-
 class Extractor(object):
     """
     Main class for extracting meta-data with GNU libextractor.
@@ -99,177 +78,37 @@
     libraries that should be used.
     """
     
-    def __init__(self, defaults=True, libraries=None, lang=None, 
languages=None, hash=None, use_filename=False, split_keywords=False):
+    def __init__(self, defaults=True, libraries=None):
        """
        Initialize Extractor's instance
        
-       @param extractors: list of strings that contains extractor's name 
(supported types)
+       @param libraries: list of strings that contains extractor's name 
(supported types)
        @param defaults: load default plugins
-       @param lang: use the generic plaintext extractor for the language with 
the 2-letter language code LANG
-       @param languages: list of lang
-       @param hash: compute hash using the given algorithm (currently 'sha1' 
or 'md5')
-       @param use_filename: use the filename as a keyword (add 
filename-extractor library)
-       @param split_keywords: use keyword splitting (add split-extractor 
library)
 
-       >>> Extractor() #doctest: +ELLIPSIS
-       <__main__.Extractor object at 0x...>
-       
-       >>> extractor = Extractor(defaults=False)
-       >>> extractor.libraries
-       ()
-
-       >>> extractor = Extractor()
-       >>> sorted(extractor.libraries) == sorted(tuple(DEFAULT_LIBRARIES))
-       True
-
-       >>> extractor = Extractor(hash='md5')
-       >>> found = False
-       >>> for library in extractor.libraries:
-       ...     if 'md5' in library:
-       ...        found = True
-       ...        break
-       >>> found
-       True
-
-       >>> extractor = Extractor(use_filename=True)
-       >>> found = False
-       >>> for library in extractor.libraries:
-       ...     if 'filename' in library:
-       ...        found = True
-       ...        break
-       >>> found
-       True
-
-       >>> extractor = Extractor(split_keywords=True)
-       >>> found = False
-       >>> for library in extractor.libraries:
-       ...     if 'split' in library:
-       ...        found = True
-       ...        break
-       >>> found
-       True
-
        """
-       self._libraries = {}
        self.extractors = None
        if defaults:
-           self.extractors = libextractor.EXTRACTOR_loadDefaultLibraries()
-           self._libraries = dict([(library, None) for library in 
DEFAULT_LIBRARIES])
-       if use_filename:
-           self.addLibrary("libextractor_filename")
+           self.extractors = libextractor.EXTRACTOR_plugin_add_defaults(0)
        if libraries:
-           self.extractors = 
libextractor.EXTRACTOR_loadConfigLibraries(self.extractors, libraries)
-           self._libraries.update(dict([(library, None) for library in 
libraries.split(':')]))
-       if isinstance(lang, str):
-           self.addLibraryLast("libextractor_printable_%s" % lang)
-       if isinstance(hash, str):
-           self.addLibraryLast("libextractor_hash_%s" % hash)
-       if languages:
-           [self.addLibraryLast("libextractor_printable_%s" % language) for 
language in languages]
-       if split_keywords:
-           self.addLibraryLast("libextractor_split")
+           self.extractors = libextractor.EXTRACTOR_plugin_add_config 
(self.extractors, libraries, 0)
     
-    def extract(self, filename=None, data=None, size=None):
+    def extract(self, proc, proc_cls, filename=None, data=None, size=0):
        """Extract keywords from a file, or from its data.
 
        @param filename: filename string
        @param data: data contents
        @param size: data size
+        @param proc: function to call on each value
+        @param proc_cls: closure to proc
        
-        This function returns a list of tuples. Its first value is keyword type
-       and its second value is keyword value. If the file cannot be opened
-       or cannot be found, the list will be empty.  The list can
-       also be empty if no keyword was found for the file.
+       If you give data, size has to be given as well.
 
-       If you give data, size had to be given too.
-
         """
        if not filename and not (data and size):
            return None
-       elif filename:
-           return self.extractFromFile(filename)
        else:
-           return self.extractFromData(data, size)
+           libextractor.EXTRACTOR_extract (self.extractors, filename, data, 
size, EXTRACT_CB(proc), proc_cls)
        
-    def extractFromFile(self, filename):
-       """Extract keywords from a file using its filename.
-
-       @param filename: filename string
-       
-        This function returns a list of tuples. Its first value is keyword type
-       and its second value is keyword value. If the file cannot be opened
-       or cannot be found, the list will be empty.  The list can
-       also be empty if no keyword was found for the file.
-
-       >>> import os
-       >>> extractor = Extractor()
-       >>> filename = os.tmpnam()
-       >>> f = file(filename, 'w')
-       >>> extractor.extract(filename)
-       []
-
-       >>> import os
-       >>> extractor = Extractor()
-       >>> filename = '../Extractor/test/test.png'
-       >>> extractor.extract(filename)
-       [(u'comment', u'Testing keyword extraction\\n'), 
(u'resource-identifier', u'dc6c58c971715e8043baef058b675eec'), (u'size', 
u'4x4'), (u'mimetype', u'image/png')]
-
-       >>> import os, glob
-       >>> extractor = Extractor()
-       >>> filename = glob.glob('dist/*.gz')[0]
-       >>> extracted = extractor.extract(filename)
-       >>> filename_count = 0
-       >>> for keyword_type, keyword in extracted:
-        ...     if keyword_type == 'filename':
-       ...        filename_count += 1
-       >>> filename_count > 1
-       True
-
-        """
-       self.keywords_p = libextractor.EXTRACTOR_getKeywords(self.extractors, 
filename)
-       return self._extract()
-
-    def extractFromData(self, data, size):
-       """Extract keywords using its data.
-
-       @param data: data contents
-       @param size: data size
-       
-        This function returns a list of tuples. Its first value is keyword type
-       and its second value is keyword value. If the file cannot be opened
-       or cannot be found, the list will be empty.  The list can
-       also be empty if no keyword was found for the file.
-
-        """
-       self.keywords_p = libextractor.EXTRACTOR_getKeywords2(self.extractors, 
data, size)
-       return self._extract()
-    
-    def _extract(self):
-       self.extracted = []
-
-       if not self.keywords_p:
-           return self.extracted
-
-       try:
-           self.keywords = self.keywords_p.contents
-       except ValueError:
-           return self.extracted
-
-       while True:
-           keyword_type = 
libextractor.EXTRACTOR_getKeywordTypeAsString(self.keywords.keywordType).decode(EXTRACTOR_ENCODING)
-           keyword = self.keywords.keyword
-           
-           if not isBinaryType(self.keywords.keywordType):
-               keyword = keyword.decode(EXTRACTOR_ENCODING)
-               
-           self.extracted.append((keyword_type, keyword))
-           try:
-               self.keywords = self.keywords.next.contents
-           except ValueError:
-               libextractor.EXTRACTOR_freeKeywords(self.keywords_p)
-               self.keywords_p = None
-               return self.extracted
-           
     def addLibrary(self, library):
        """
         Add given library to the extractor. Invoke with a string with the name
@@ -285,20 +124,8 @@
 
        @param library: library's name
         """    
-       self._libraries[library] = None
+       self.extractors = libextractor.EXTRACTOR_plugin_add (self.extractors, 
library, NULL, 0)
 
-       self.extractors = libextractor.EXTRACTOR_addLibrary(self.extractors, 
library)
-
-    def addLibraryLast(self, library):
-       """
-       Same as addLibrary but the library is added at the last.
-
-       @param library: library's name
-       """
-       self._libraries[library] = None
-       
-       self.extractors = 
libextractor.EXTRACTOR_addLibraryLast(self.extractors, library)
-
     def removeLibrary(self, library):
        """      
         Remove a library.  Pass the name of the library that is to
@@ -312,13 +139,9 @@
 
        @param library: library's name
        """
-       try:
-           del self._libraries[library]
-       except KeyError:
-           raise ValueError, "No such loaded library"
-       
-       self.extractors = libextractor.EXTRACTOR_removeLibrary(self.extractors, 
library)
 
+       self.extractors = libextractor.EXTRACTOR_plugin_remove(self.extractors, 
library)
+
     def addLibraries(self, libraries):
        """
        Add given libraries. 
@@ -326,38 +149,29 @@
 
        @param libraries: list of libraries names
        """
-       for library in libraries:
-           if isinstance(library, str):
-               self.addLibrary(library)
 
+       self.extractors = 
libextractor.EXTRACTOR_plugin_add_config(self.extractors, libraries)
+
     def removeAllLibraries(self):
        """
        Remove all libraries.
 
-       >>> extractor = Extractor()
-       >>> extractor.removeAllLibraries()
-       >>> extractor.libraries
-       ()
        """
-       self._libraries = {}
-       if self.extractors:
-           libextractor.EXTRACTOR_removeAll(self.extractors)
-           self.extractors = None
+
+        libextractor.EXTRACTOR_plugin_remove_all(self.extractors)
+        self.extractors = None
        
     def keywordTypes(self):
        """
        Returns the list of all keywords types.
        @return: list of all keywords types
 
-       >>> extractor = Extractor()
-       >>> extractor.keywordTypes()
-       ('unknown', 'filename', 'mimetype', 'title', 'author', 'artist', 
'description', 'comment', 'date', 'publisher', 'language', 'album', 'genre', 
'location', 'version', 'organization', 'copyright', 'subject', 'keywords', 
'contributor', 'resource-type', 'format', 'resource-identifier', 'source', 
'relation', 'coverage', 'software', 'disclaimer', 'warning', 'translated', 
'creation date', 'modification date', 'creator', 'producer', 'page count', 
'page orientation', 'paper size', 'used fonts', 'page order', 'created for', 
'magnification', 'release', 'group', 'size', 'summary', 'packager', 'vendor', 
'license', 'distribution', 'build-host', 'os', 'dependency', 'MD4', 'MD5', 
'SHA-0', 'SHA-1', 'RipeMD160', 'resolution', 'category', 'book title', 
'priority', 'conflicts', 'replaces', 'provides', 'conductor', 'interpreter', 
'owner', 'lyrics', 'media type', 'contact', 'binary thumbnail data', 
'publication date', 'camera make', 'camera model', 'exposure', 'aperture', 
'exposure bias', 'flash'
 , 'flash bias', 'focal length', 'focal length (35mm equivalent)', 'iso speed', 
'exposure mode', 'metering mode', 'macro mode', 'image quality', 'white 
balance', 'orientation')
        """
        i = 0
        keyword_types = []
        
        while True:
-           keyword_type = libextractor.EXTRACTOR_getKeywordTypeAsString(i)
+           keyword_type = libextractor.EXTRACTOR_metatype_to_string(i)
            if not keyword_type:
                break
            keyword_types.append(keyword_type)
@@ -365,31 +179,7 @@
            
        return tuple(keyword_types)
     
-    def _get_libraries(self):
-       """
-       Return current libraries
-       @return: current libraries
-       """
-       return tuple(self._libraries.keys())
 
-    def _set_libraries(self, libraries):
-       """
-       Add libraries to load (don't replace current ones)
-
-       @param libraries: list of libraries
-       
-       >>> extractor = Extractor()
-       >>> extractor.libraries = ('libextractor_filename', )
-       >>> 'libextractor_filename' in extractor.libraries
-       True
-       >>> len(extractor.libraries) == len(DEFAULT_LIBRARIES)+1
-       True
-       
-       """
-       self.addLibraries(libraries)
-
-    libraries = property(fget=_get_libraries, fset=_set_libraries, 
fdel=removeAllLibraries, doc='tuple of loaded libraries')
-
     def __del__(self):
        """
        >>> extractor = Extractor()

Modified: Extractor-python/setup.py
===================================================================
--- Extractor-python/setup.py   2011-09-09 18:13:19 UTC (rev 16744)
+++ Extractor-python/setup.py   2011-09-09 19:13:37 UTC (rev 16745)
@@ -6,7 +6,7 @@
 
 setup(
     name = "Extractor",
-    version = "0.5",
+    version = "0.6",
 
     py_modules = ['extractor'],
     scripts = ['extract.py'],
@@ -20,9 +20,6 @@
     license = "GNU GPL",
     keywords = "libextractor binding tag metadata",
     url = "http://gnunet.org/libextractor/";,  
-
-    dependency_links=['http://starship.python.net/crew/theller/ctypes/',],
-
     long_description="""libextractor is a simple library for keyword 
extraction.  libextractor
 does not support all formats but supports a simple plugging mechanism
 such that you can quickly add extractors for additional formats, even
[Prev in Thread]
Current Thread
[Next in Thread]
[GNUnet-SVN] r16745 - Extractor-python, gnunet <=
Prev by Date: [GNUnet-SVN] r16744 - gnunet/src/dht
Next by Date: [GNUnet-SVN] r16746 - in libmicrohttpd: doc src/include
Previous by thread: [GNUnet-SVN] r16744 - gnunet/src/dht
Next by thread: [GNUnet-SVN] r16746 - in libmicrohttpd: doc src/include
Index(es):
- Date
- Thread