gnunet-svn
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[www] 01/02: better text extraction.


From: gnunet
Subject: [www] 01/02: better text extraction.
Date: Mon, 11 Nov 2019 22:18:35 +0100

This is an automated email from the git hooks/post-receive script.

ng0 pushed a commit to branch master
in repository www.

commit 97beb6b2a9d0722112f6de9636fe5a21fb76af76
Author: ng0 <address@hidden>
AuthorDate: Mon Nov 11 21:15:15 2019 +0000

    better text extraction.
---
 template.py | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/template.py b/template.py
index e2a689b..89e4856 100755
--- a/template.py
+++ b/template.py
@@ -33,8 +33,8 @@ from pathlib import Path
 import hashlib
 from bs4 import BeautifulSoup
 from ruamel.yaml import YAML
+import html.parser
 
-# TODO: Turn repetition into a class.
 
 env = jinja2.Environment(loader=jinja2.FileSystemLoader(
     os.path.dirname(__file__)),
@@ -45,6 +45,22 @@ env = jinja2.Environment(loader=jinja2.FileSystemLoader(
                          autoescape=False)
 
 
+class extractText(html.parser.HTMLParser):
+    def __init__(self):
+        super(extractText, self).__init__()
+        self.result = []
+    def handle_data(self, data):
+        self.result.append(data)
+    def text_in(self):
+        return ''.join(self.result)
+
+
+def html2text(html):
+    k = extractText()
+    k.feed(html)
+    return k.text_in()
+
+
 def localized(filename, locale, *args):
     if len(args) == 0:
         return "../" + locale + "/" + filename
@@ -165,7 +181,7 @@ def preview_text(filename, count):
         for i in soup.findAll('p')[1]:
             k.append(i)
         b = ''.join(str(e) for e in k)
-        text = b.replace("\n", "")
+        text = html2text(b.replace("\n", ""))
         textreduced = (text[:count] + '...') if len(text) > count else (text + '..')
         return(textreduced)
 
@@ -271,7 +287,7 @@ def main():
     conf=yaml.load(site_configfile)
 
     for item in conf["newsposts"]:
-        item['abstract'] = abstract_news(item['page'], 300)
+        item['abstract'] = abstract_news(item['page'], 1000)
     print("generating template")
     generate_site("template", conf)
     print("generating news")

-- 
To stop receiving notification emails like this one, please contact
address@hidden.



reply via email to

[Prev in Thread] Current Thread [Next in Thread]