[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[GNUnet-SVN] r16745 - Extractor-python
From: |
gnunet |
Subject: |
[GNUnet-SVN] r16745 - Extractor-python |
Date: |
Fri, 9 Sep 2011 21:13:37 +0200 |
Author: grothoff
Date: 2011-09-09 21:13:37 +0200 (Fri, 09 Sep 2011)
New Revision: 16745
Modified:
Extractor-python/ChangeLog
Extractor-python/README
Extractor-python/extract.py
Extractor-python/extractor.py
Extractor-python/setup.py
Log:
Updating the Python binding to work with GNU libextractor (LE) 0.6
Modified: Extractor-python/ChangeLog
===================================================================
--- Extractor-python/ChangeLog 2011-09-09 18:13:19 UTC (rev 16744)
+++ Extractor-python/ChangeLog 2011-09-09 19:13:37 UTC (rev 16745)
@@ -1,8 +1,9 @@
+Fri Sep 9 21:09:47 CEST 2011
+ Updated Python binding to GNU libextractor 0.6-API. -CG
+
Sun May 28 00:42:00 CEST 2006
Rewritten using ctypes.
-
+
Mon Jul 4 17:19:33 CEST 2005
Moved python binding into separate package.
-
-
Modified: Extractor-python/README
===================================================================
--- Extractor-python/README 2011-09-09 18:13:19 UTC (rev 16744)
+++ Extractor-python/README 2011-09-09 19:13:37 UTC (rev 16745)
@@ -18,8 +18,8 @@
* python >= 2.3
web site: http://www.python.org/
- * libextractor > 0.5
- web site: http://gnunet.org/libextractor
+ * libextractor >= 0.6
+ web site: http://www.gnu.org/software/libextractor/
* ctypes >= 0.9
web site: http://starship.python.net/crew/theller/ctypes/
@@ -62,6 +62,7 @@
=========
Copyright (C) 2006 Bader Ladjemi <address@hidden>
+ Copyright (C) 2011 Christian Grothoff <address@hidden>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
Modified: Extractor-python/extract.py
===================================================================
--- Extractor-python/extract.py 2011-09-09 18:13:19 UTC (rev 16744)
+++ Extractor-python/extract.py 2011-09-09 19:13:37 UTC (rev 16745)
@@ -24,10 +24,22 @@
"""
import extractor
import sys
+from ctypes import *
+import struct
xtract = extractor.Extractor()
+
+def print_k(xt, plugin, type, format, mime, data, datalen):
+ mstr = cast (data, c_char_p)
+# FIXME: this ignores 'datalen', not that great...
+# (in general, depending on the mime type and format, only
+# the first 'datalen' bytes in 'data' should be used).
+ if (format == extractor.EXTRACTOR_METAFORMAT_UTF8):
+ print "%s - %s" % (xtract.keywordTypes()[type], mstr.value)
+ return 0
+
+
for arg in sys.argv[1:]:
print "Keywords from %s:" % arg
- keys = xtract.extract(arg)
- for keyword_type, keyword in keys:
- print "%s - %s" % (keyword_type.encode('iso-8859-1'),
keyword.encode('iso-8859-1'))
+ xtract.extract(print_k, None, arg)
+
Modified: Extractor-python/extractor.py
===================================================================
--- Extractor-python/extractor.py 2011-09-09 18:13:19 UTC (rev 16744)
+++ Extractor-python/extractor.py 2011-09-09 19:13:37 UTC (rev 16745)
@@ -2,6 +2,7 @@
## Python bindings for GNU libextractor
##
## Copyright (C) 2006 Bader Ladjemi <address@hidden>
+## Copyright (C) 2011 Christian Grothoff <address@hidden>
##
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
@@ -34,57 +35,35 @@
#fake cdll import
try:
#loading shared object file
- libextractor = cdll.LoadLibrary('libextractor.so.1')
+ libextractor = cdll.LoadLibrary('libextractor.so.3')
except OSError:
libextractor = cdll.extractor
-__all__ = ['Extractor', 'isBinaryType', 'EXTRACTOR_ENCODING',
'DEFAULT_LIBRARIES', 'EXTRACTOR_THUMBNAIL_DATA']
-__version__ = "0.5"
+__all__ = ['Extractor']
+__version__ = "0.6"
__licence__ = "GNU GPL"
"""
keyword's charset encoding
"""
-EXTRACTOR_ENCODING = "utf-8"
-
KeywordType = c_int
-Keywords_p = POINTER('Keywords')
-class Keywords(Structure):
- """
- EXTRACTOR_Keywords struct
- """
- _fields_ = [('keyword', c_char_p),
- ('keywordType', KeywordType),
- ('next', Keywords_p)]
-SetPointerType(Keywords_p, Keywords)
+MetaType = c_int
-KEYWORDS = POINTER(Keywords)
+EXTRACT_CB = CFUNCTYPE(c_int, c_void_p, c_char_p, KeywordType, MetaType,
c_char_p, c_void_p, c_size_t)
-libextractor.EXTRACTOR_getKeywords.restype = KEYWORDS
-libextractor.EXTRACTOR_getKeywords2.restype = KEYWORDS
-libextractor.EXTRACTOR_removeDuplicateKeywords.restype = KEYWORDS
-libextractor.EXTRACTOR_getKeywordTypeAsString.restype = c_char_p
+libextractor.EXTRACTOR_metatype_get_max.restype = KeywordType
+libextractor.EXTRACTOR_metatype_to_description.restype = c_char_p
+libextractor.EXTRACTOR_metatype_to_string.restype = c_char_p
+libextractor.EXTRACTOR_plugin_add_defaults.restype = c_void_p
+libextractor.EXTRACTOR_extract.argtypes = [c_void_p, c_char_p, c_void_p,
c_size_t, EXTRACT_CB, c_void_p]
-libextractor.EXTRACTOR_getDefaultLibraries.restype = c_char_p
-"""
-thumbnail keyword type (binary)
-"""
-EXTRACTOR_THUMBNAIL_DATA = 70
+EXTRACTOR_METAFORMAT_UNKNOWN = 0
+EXTRACTOR_METAFORMAT_UTF8 = 1
+EXTRACTOR_METAFORMAT_BINARY = 2
+EXTRACTOR_METAFORMAT_C_STRING = 3
-def isBinaryType(keyword_type):
- """
- returns if the given keyword_type is binary
- @param keyword_type: keyword type (int)
- """
- return keyword_type == EXTRACTOR_THUMBNAIL_DATA
-
-"""
-default loaded libraries
-"""
-DEFAULT_LIBRARIES = libextractor.EXTRACTOR_getDefaultLibraries().split(':')
-
class Extractor(object):
"""
Main class for extracting meta-data with GNU libextractor.
@@ -99,177 +78,37 @@
libraries that should be used.
"""
- def __init__(self, defaults=True, libraries=None, lang=None,
languages=None, hash=None, use_filename=False, split_keywords=False):
+ def __init__(self, defaults=True, libraries=None):
"""
Initialize Extractor's instance
- @param extractors: list of strings that contains extractor's name
(supported types)
+ @param libraries: list of strings that contains extractor's name
(supported types)
@param defaults: load default plugins
- @param lang: use the generic plaintext extractor for the language with
the 2-letter language code LANG
- @param languages: list of lang
- @param hash: compute hash using the given algorithm (currently 'sha1'
or 'md5')
- @param use_filename: use the filename as a keyword (add
filename-extractor library)
- @param split_keywords: use keyword splitting (add split-extractor
library)
- >>> Extractor() #doctest: +ELLIPSIS
- <__main__.Extractor object at 0x...>
-
- >>> extractor = Extractor(defaults=False)
- >>> extractor.libraries
- ()
-
- >>> extractor = Extractor()
- >>> sorted(extractor.libraries) == sorted(tuple(DEFAULT_LIBRARIES))
- True
-
- >>> extractor = Extractor(hash='md5')
- >>> found = False
- >>> for library in extractor.libraries:
- ... if 'md5' in library:
- ... found = True
- ... break
- >>> found
- True
-
- >>> extractor = Extractor(use_filename=True)
- >>> found = False
- >>> for library in extractor.libraries:
- ... if 'filename' in library:
- ... found = True
- ... break
- >>> found
- True
-
- >>> extractor = Extractor(split_keywords=True)
- >>> found = False
- >>> for library in extractor.libraries:
- ... if 'split' in library:
- ... found = True
- ... break
- >>> found
- True
-
"""
- self._libraries = {}
self.extractors = None
if defaults:
- self.extractors = libextractor.EXTRACTOR_loadDefaultLibraries()
- self._libraries = dict([(library, None) for library in
DEFAULT_LIBRARIES])
- if use_filename:
- self.addLibrary("libextractor_filename")
+ self.extractors = libextractor.EXTRACTOR_plugin_add_defaults(0)
if libraries:
- self.extractors =
libextractor.EXTRACTOR_loadConfigLibraries(self.extractors, libraries)
- self._libraries.update(dict([(library, None) for library in
libraries.split(':')]))
- if isinstance(lang, str):
- self.addLibraryLast("libextractor_printable_%s" % lang)
- if isinstance(hash, str):
- self.addLibraryLast("libextractor_hash_%s" % hash)
- if languages:
- [self.addLibraryLast("libextractor_printable_%s" % language) for
language in languages]
- if split_keywords:
- self.addLibraryLast("libextractor_split")
+ self.extractors = libextractor.EXTRACTOR_plugin_add_config
(self.extractors, libraries, 0)
- def extract(self, filename=None, data=None, size=None):
+ def extract(self, proc, proc_cls, filename=None, data=None, size=0):
"""Extract keywords from a file, or from its data.
@param filename: filename string
@param data: data contents
@param size: data size
+ @param proc: function to call on each value
+ @param proc_cls: closure to proc
- This function returns a list of tuples. Its first value is keyword type
- and its second value is keyword value. If the file cannot be opened
- or cannot be found, the list will be empty. The list can
- also be empty if no keyword was found for the file.
+ If you give data, size has to be given as well.
- If you give data, size had to be given too.
-
"""
if not filename and not (data and size):
return None
- elif filename:
- return self.extractFromFile(filename)
else:
- return self.extractFromData(data, size)
+ libextractor.EXTRACTOR_extract (self.extractors, filename, data,
size, EXTRACT_CB(proc), proc_cls)
- def extractFromFile(self, filename):
- """Extract keywords from a file using its filename.
-
- @param filename: filename string
-
- This function returns a list of tuples. Its first value is keyword type
- and its second value is keyword value. If the file cannot be opened
- or cannot be found, the list will be empty. The list can
- also be empty if no keyword was found for the file.
-
- >>> import os
- >>> extractor = Extractor()
- >>> filename = os.tmpnam()
- >>> f = file(filename, 'w')
- >>> extractor.extract(filename)
- []
-
- >>> import os
- >>> extractor = Extractor()
- >>> filename = '../Extractor/test/test.png'
- >>> extractor.extract(filename)
- [(u'comment', u'Testing keyword extraction\\n'),
(u'resource-identifier', u'dc6c58c971715e8043baef058b675eec'), (u'size',
u'4x4'), (u'mimetype', u'image/png')]
-
- >>> import os, glob
- >>> extractor = Extractor()
- >>> filename = glob.glob('dist/*.gz')[0]
- >>> extracted = extractor.extract(filename)
- >>> filename_count = 0
- >>> for keyword_type, keyword in extracted:
- ... if keyword_type == 'filename':
- ... filename_count += 1
- >>> filename_count > 1
- True
-
- """
- self.keywords_p = libextractor.EXTRACTOR_getKeywords(self.extractors,
filename)
- return self._extract()
-
- def extractFromData(self, data, size):
- """Extract keywords using its data.
-
- @param data: data contents
- @param size: data size
-
- This function returns a list of tuples. Its first value is keyword type
- and its second value is keyword value. If the file cannot be opened
- or cannot be found, the list will be empty. The list can
- also be empty if no keyword was found for the file.
-
- """
- self.keywords_p = libextractor.EXTRACTOR_getKeywords2(self.extractors,
data, size)
- return self._extract()
-
- def _extract(self):
- self.extracted = []
-
- if not self.keywords_p:
- return self.extracted
-
- try:
- self.keywords = self.keywords_p.contents
- except ValueError:
- return self.extracted
-
- while True:
- keyword_type =
libextractor.EXTRACTOR_getKeywordTypeAsString(self.keywords.keywordType).decode(EXTRACTOR_ENCODING)
- keyword = self.keywords.keyword
-
- if not isBinaryType(self.keywords.keywordType):
- keyword = keyword.decode(EXTRACTOR_ENCODING)
-
- self.extracted.append((keyword_type, keyword))
- try:
- self.keywords = self.keywords.next.contents
- except ValueError:
- libextractor.EXTRACTOR_freeKeywords(self.keywords_p)
- self.keywords_p = None
- return self.extracted
-
def addLibrary(self, library):
"""
Add given library to the extractor. Invoke with a string with the name
@@ -285,20 +124,8 @@
@param library: library's name
"""
- self._libraries[library] = None
+ self.extractors = libextractor.EXTRACTOR_plugin_add (self.extractors,
library, NULL, 0)
- self.extractors = libextractor.EXTRACTOR_addLibrary(self.extractors,
library)
-
- def addLibraryLast(self, library):
- """
- Same as addLibrary but the library is added at the last.
-
- @param library: library's name
- """
- self._libraries[library] = None
-
- self.extractors =
libextractor.EXTRACTOR_addLibraryLast(self.extractors, library)
-
def removeLibrary(self, library):
"""
Remove a library. Pass the name of the library that is to
@@ -312,13 +139,9 @@
@param library: library's name
"""
- try:
- del self._libraries[library]
- except KeyError:
- raise ValueError, "No such loaded library"
-
- self.extractors = libextractor.EXTRACTOR_removeLibrary(self.extractors,
library)
+ self.extractors = libextractor.EXTRACTOR_plugin_remove(self.extractors,
library)
+
def addLibraries(self, libraries):
"""
Add given libraries.
@@ -326,38 +149,29 @@
@param libraries: list of libraries names
"""
- for library in libraries:
- if isinstance(library, str):
- self.addLibrary(library)
+ self.extractors =
libextractor.EXTRACTOR_plugin_add_config(self.extractors, libraries)
+
def removeAllLibraries(self):
"""
Remove all libraries.
- >>> extractor = Extractor()
- >>> extractor.removeAllLibraries()
- >>> extractor.libraries
- ()
"""
- self._libraries = {}
- if self.extractors:
- libextractor.EXTRACTOR_removeAll(self.extractors)
- self.extractors = None
+
+ libextractor.EXTRACTOR_plugin_remove_all(self.extractors)
+ self.extractors = None
def keywordTypes(self):
"""
Returns the list of all keywords types.
@return: list of all keywords types
- >>> extractor = Extractor()
- >>> extractor.keywordTypes()
- ('unknown', 'filename', 'mimetype', 'title', 'author', 'artist',
'description', 'comment', 'date', 'publisher', 'language', 'album', 'genre',
'location', 'version', 'organization', 'copyright', 'subject', 'keywords',
'contributor', 'resource-type', 'format', 'resource-identifier', 'source',
'relation', 'coverage', 'software', 'disclaimer', 'warning', 'translated',
'creation date', 'modification date', 'creator', 'producer', 'page count',
'page orientation', 'paper size', 'used fonts', 'page order', 'created for',
'magnification', 'release', 'group', 'size', 'summary', 'packager', 'vendor',
'license', 'distribution', 'build-host', 'os', 'dependency', 'MD4', 'MD5',
'SHA-0', 'SHA-1', 'RipeMD160', 'resolution', 'category', 'book title',
'priority', 'conflicts', 'replaces', 'provides', 'conductor', 'interpreter',
'owner', 'lyrics', 'media type', 'contact', 'binary thumbnail data',
'publication date', 'camera make', 'camera model', 'exposure', 'aperture',
'exposure bias', 'flash'
, 'flash bias', 'focal length', 'focal length (35mm equivalent)', 'iso speed',
'exposure mode', 'metering mode', 'macro mode', 'image quality', 'white
balance', 'orientation')
"""
i = 0
keyword_types = []
while True:
- keyword_type = libextractor.EXTRACTOR_getKeywordTypeAsString(i)
+ keyword_type = libextractor.EXTRACTOR_metatype_to_string(i)
if not keyword_type:
break
keyword_types.append(keyword_type)
@@ -365,31 +179,7 @@
return tuple(keyword_types)
- def _get_libraries(self):
- """
- Return current libraries
- @return: current libraries
- """
- return tuple(self._libraries.keys())
- def _set_libraries(self, libraries):
- """
- Add libraries to load (don't replace current ones)
-
- @param libraries: list of libraries
-
- >>> extractor = Extractor()
- >>> extractor.libraries = ('libextractor_filename', )
- >>> 'libextractor_filename' in extractor.libraries
- True
- >>> len(extractor.libraries) == len(DEFAULT_LIBRARIES)+1
- True
-
- """
- self.addLibraries(libraries)
-
- libraries = property(fget=_get_libraries, fset=_set_libraries,
fdel=removeAllLibraries, doc='tuple of loaded libraries')
-
def __del__(self):
"""
>>> extractor = Extractor()
Modified: Extractor-python/setup.py
===================================================================
--- Extractor-python/setup.py 2011-09-09 18:13:19 UTC (rev 16744)
+++ Extractor-python/setup.py 2011-09-09 19:13:37 UTC (rev 16745)
@@ -6,7 +6,7 @@
setup(
name = "Extractor",
- version = "0.5",
+ version = "0.6",
py_modules = ['extractor'],
scripts = ['extract.py'],
@@ -20,9 +20,6 @@
license = "GNU GPL",
keywords = "libextractor binding tag metadata",
url = "http://gnunet.org/libextractor/",
-
- dependency_links=['http://starship.python.net/crew/theller/ctypes/',],
-
long_description="""libextractor is a simple library for keyword
extraction. libextractor
does not support all formats but supports a simple plugging mechanism
such that you can quickly add extractors for additional formats, even
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- [GNUnet-SVN] r16745 - Extractor-python,
gnunet <=