[med-svn] [Git][med-team/python-wordcloud][upstream] New upstream version 1.8.0+dfsg
Nilesh Patra
gitlab at salsa.debian.org
Sun Sep 27 11:15:48 BST 2020
Nilesh Patra pushed to branch upstream at Debian Med / python-wordcloud
Commits:
80d06ac2 by Nilesh Patra at 2020-09-27T15:14:09+05:30
New upstream version 1.8.0+dfsg
- - - - -
14 changed files:
- .circleci/config.yml
- .travis.yml
- + CONTRIBUTING.md
- README.md
- appveyor.yml
- requirements-dev.txt
- setup.cfg
- test/test_wordcloud.py
- test/test_wordcloud_cli.py
- versioneer.py
- wordcloud/_version.py
- wordcloud/tokenization.py
- wordcloud/wordcloud.py
- wordcloud/wordcloud_cli.py
Changes:
=====================================
.circleci/config.yml
=====================================
@@ -28,12 +28,12 @@ references:
x64_build_job: &x64_build_job
docker:
- - image: dockcross/manylinux-x64
+ - image: dockcross/manylinux1-x64
<<: *ci_steps
x86_build_job: &x86_build_job
docker:
- - image: dockcross/manylinux-x86
+ - image: dockcross/manylinux1-x86
<<: *ci_steps
deploy_website_command: &deploy_website_command
@@ -55,28 +55,28 @@ jobs:
<<: *x64_build_job
manylinux-x64_cp27-cp27mu:
<<: *x64_build_job
- manylinux-x64_cp34-cp34m:
- <<: *x64_build_job
manylinux-x64_cp35-cp35m:
<<: *x64_build_job
manylinux-x64_cp36-cp36m_upload-sdist:
<<: *x64_build_job
manylinux-x64_cp37-cp37m:
<<: *x64_build_job
+ manylinux-x64_cp38-cp38:
+ <<: *x64_build_job
# x86
#manylinux-x86_cp27-cp27m:
# <<: *x86_build_job
#manylinux-x86_cp27-cp27mu:
# <<: *x86_build_job
- #manylinux-x86_cp34-cp34m:
- # <<: *x86_build_job
#manylinux-x86_cp35-cp35m:
# <<: *x86_build_job
#manylinux-x86_cp36-cp36m:
# <<: *x86_build_job
#manylinux-x86_cp37-cp37m:
# <<: *x86_build_job
+ #manylinux-x86_cp38-cp38:
+ # <<: *x86_build_job
build-website_cp37-cp37m:
docker:
@@ -141,27 +141,27 @@ workflows:
<<: *no_filters
- manylinux-x64_cp27-cp27mu:
<<: *no_filters
- - manylinux-x64_cp34-cp34m:
- <<: *no_filters
- manylinux-x64_cp35-cp35m:
<<: *no_filters
- manylinux-x64_cp36-cp36m_upload-sdist:
<<: *no_filters
- manylinux-x64_cp37-cp37m:
<<: *no_filters
+ - manylinux-x64_cp38-cp38:
+ <<: *no_filters
# x86
#- manylinux-x86_cp27-cp27m:
# <<: *no_filters
#- manylinux-x86_cp27-cp27mu:
# <<: *no_filters
- #- manylinux-x86_cp34-cp34m:
- # <<: *no_filters
#- manylinux-x86_cp35-cp35m:
# <<: *no_filters
#- manylinux-x86_cp36-cp36m:
# <<: *no_filters
#- manylinux-x86_cp37-cp37m:
# <<: *no_filters
+ #- manylinux-x86_cp38-cp38:
+ # <<: *no_filters
- build-website_cp37-cp37m:
requires:
@@ -173,17 +173,17 @@ workflows:
# x64
- manylinux-x64_cp27-cp27m
- manylinux-x64_cp27-cp27mu
- - manylinux-x64_cp34-cp34m
- manylinux-x64_cp35-cp35m
- manylinux-x64_cp36-cp36m_upload-sdist
- manylinux-x64_cp37-cp37m
+ - manylinux-x64_cp38-cp38
# x86
#- manylinux-x86_cp27-cp27m
#- manylinux-x86_cp27-cp27mu
- #- manylinux-x86_cp34-cp34m
#- manylinux-x86_cp35-cp35m
#- manylinux-x86_cp36-cp36m
#- manylinux-x86_cp37-cp37m
+ #- manylinux-x86_cp38-cp38
# misc
- build-website_cp37-cp37m
filters:
@@ -194,17 +194,17 @@ workflows:
# x64
- manylinux-x64_cp27-cp27m
- manylinux-x64_cp27-cp27mu
- - manylinux-x64_cp34-cp34m
- manylinux-x64_cp35-cp35m
- manylinux-x64_cp36-cp36m_upload-sdist
- manylinux-x64_cp37-cp37m
+ - manylinux-x64_cp38-cp38
# x86
#- manylinux-x86_cp27-cp27m
#- manylinux-x86_cp27-cp27mu
- #- manylinux-x86_cp34-cp34m
#- manylinux-x86_cp35-cp35m
#- manylinux-x86_cp36-cp36m
#- manylinux-x86_cp37-cp37m
+ #- manylinux-x86_cp38-cp38
# misc
- build-website_cp37-cp37m
filters:
=====================================
.travis.yml
=====================================
@@ -11,22 +11,22 @@ matrix:
- os: osx
language: generic
env:
- - PYTHON_VERSION=3.7.0
+ - PYTHON_VERSION=3.8.0
- os: osx
language: generic
env:
- - PYTHON_VERSION=3.6.5
+ - PYTHON_VERSION=3.7.0
- os: osx
language: generic
env:
- - PYTHON_VERSION=3.5.5
+ - PYTHON_VERSION=3.6.5
- os: osx
language: generic
env:
- - PYTHON_VERSION=3.4.8
+ - PYTHON_VERSION=3.5.5
- os: osx
language: generic
@@ -35,10 +35,10 @@ matrix:
cache:
directories:
+ - $HOME/.pyenv/versions/3.8.0
- $HOME/.pyenv/versions/3.7.0
- $HOME/.pyenv/versions/3.6.5
- $HOME/.pyenv/versions/3.5.5
- - $HOME/.pyenv/versions/3.4.8
- $HOME/.pyenv/versions/2.7.15
- $HOME/downloads
=====================================
CONTRIBUTING.md
=====================================
@@ -0,0 +1,46 @@
+# Contributing
+
+To contribute to wordcloud, you'll need to follow the instructions in
+[Creating a pull request from a fork](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request-from-a-fork).
+
+In addition to the general procedure for creating a pull request, please follow
+the following steps:
+
+## Before starting development
+
+### Use a correct version of Python
+
+Python 3.7.x should be fine for development.
+
+```
+python --version
+> Python 3.7.6
+```
+
+### Install all dependencies
+
+```
+pip install -U -r requirements.txt -r requirements-dev.txt
+```
+
+### Ensure that files are correctly formatted
+
+```
+flake8
+```
+
+### Ensure that tests pass
+
+```
+pip install -e .
+pytest
+```
+
+## Before creating a pull request
+
+### Confirm formatting and test passage
+
+```
+flake8
+pytest
+```
=====================================
README.md
=====================================
@@ -57,6 +57,10 @@ Or run [examples/masked.py][masked] to see more options. A sample output is:
Getting fancy with some colors:
![Parrot with rainbow colors](examples/parrot_new.png)
+Generating wordclouds for Arabic:
+
+![Arabic wordcloud](examples/arabic_example.png)
+
## Command-line usage
=====================================
appveyor.yml
=====================================
@@ -8,56 +8,56 @@ version: "0.0.1.{build}"
environment:
matrix:
-# - PYTHON_DIR: "C:\\Python27"
-# PYTHON_VERSION: "2.7.x"
-# PYTHON_ARCH: "32"
-# BLOCK: "0"
+ - PYTHON_DIR: "C:\\Python27"
+ PYTHON_VERSION: "2.7.x"
+ PYTHON_ARCH: "32"
+ BLOCK: "0"
- PYTHON_DIR: "C:\\Python27-x64"
PYTHON_VERSION: "2.7.x"
PYTHON_ARCH: "64"
BLOCK: "0"
-# - PYTHON_DIR: "C:\\Python34"
-# PYTHON_VERSION: "3.4.x"
-# PYTHON_ARCH: "32"
-# BLOCK: "0"
-
- - PYTHON_DIR: "C:\\Python34-x64"
- PYTHON_VERSION: "3.4.x"
- PYTHON_ARCH: "64"
+ - PYTHON_DIR: "C:\\Python35"
+ PYTHON_VERSION: "3.5.x"
+ PYTHON_ARCH: "32"
BLOCK: "0"
-# - PYTHON_DIR: "C:\\Python35"
-# PYTHON_VERSION: "3.5.x"
-# PYTHON_ARCH: "32"
-# BLOCK: "0"
-
- PYTHON_DIR: "C:\\Python35-x64"
PYTHON_VERSION: "3.5.x"
PYTHON_ARCH: "64"
BLOCK: "0"
-# - PYTHON_DIR: "C:\\Python36"
-# PYTHON_VERSION: "3.6.x"
-# PYTHON_ARCH: "32"
-# BLOCK: "0"
+ - PYTHON_DIR: "C:\\Python36"
+ PYTHON_VERSION: "3.6.x"
+ PYTHON_ARCH: "32"
+ BLOCK: "0"
- PYTHON_DIR: "C:\\Python36-x64"
PYTHON_VERSION: "3.6.x"
PYTHON_ARCH: "64"
BLOCK: "0"
-# - PYTHON_DIR: "C:\\Python37"
-# PYTHON_VERSION: "3.7.x"
-# PYTHON_ARCH: "32"
-# BLOCK: "0"
-#
+ - PYTHON_DIR: "C:\\Python37"
+ PYTHON_VERSION: "3.7.x"
+ PYTHON_ARCH: "32"
+ BLOCK: "0"
+
- PYTHON_DIR: "C:\\Python37-x64"
PYTHON_VERSION: "3.7.x"
PYTHON_ARCH: "64"
BLOCK: "0"
+ - PYTHON_DIR: "C:\\Python38"
+ PYTHON_VERSION: "3.8.x"
+ PYTHON_ARCH: "32"
+ BLOCK: "0"
+
+ - PYTHON_DIR: "C:\\Python38-x64"
+ PYTHON_VERSION: "3.8.x"
+ PYTHON_ARCH: "64"
+ BLOCK: "0"
+
PYPI_USER:
secure: deKM8MvS0hJbOBqZmBym0Q==
PYPI_PASSWORD:
=====================================
requirements-dev.txt
=====================================
@@ -1,10 +1,10 @@
codecov
coverage
-flake8
+flake8==3.8.0
mock
-pytest
+pytest<5.4.0
pytest-cov
pytest-sugar
setuptools>=28.0.0
twine
-wheel==0.31.1
+wheel==0.34.1
=====================================
setup.cfg
=====================================
@@ -17,7 +17,7 @@ show-source = True
# Maximum cyclomatic complexity allowed
max-complexity = 14
format = pylint
-exclude = .git,.idea,.eggs,__pycache__,doc/_build,doc/auto_examples,doc/conf.py,build,wordcloud/_version.py,versioneer.py
+exclude = .git,.idea,.eggs,__pycache__,dist,doc/_build,doc/auto_examples,doc/conf.py,build,wordcloud/_version.py,versioneer.py
[tool:pytest]
addopts = -v --cov --cov-report xml --tb=short
=====================================
test/test_wordcloud.py
=====================================
@@ -6,6 +6,7 @@ import pytest
from random import Random
from numpy.testing import assert_array_equal
from PIL import Image
+import xml.etree.ElementTree as ET
import matplotlib
matplotlib.use('Agg')
@@ -40,12 +41,34 @@ Namespaces are one honking great idea -- let's do more of those!
46 09 55 05 82 23 17 25 35 94 08 128
"""
+STOPWORDED_COLLOCATIONS = """
+thank you very much
+thank you very much
+thank you very much
+thanks
+"""
+
+STOPWORDED_COLLOCATIONS_UPPERCASE = """
+Thank you very much
+Thank you very much
+Thank you very much
+thank you very much
+hi There
+Hi there
+Hi There
+thanks
+"""
+
+SMALL_CANVAS = """
+better late than never someone will say
+"""
+
def test_collocations():
- wc = WordCloud(collocations=False, stopwords=[])
+ wc = WordCloud(collocations=False, stopwords=set())
wc.generate(THIS)
- wc2 = WordCloud(collocations=True, stopwords=[])
+ wc2 = WordCloud(collocations=True, stopwords=set())
wc2.generate(THIS)
assert "is better" in wc2.words_
@@ -53,6 +76,30 @@ def test_collocations():
assert "way may" not in wc2.words_
+def test_collocation_stopwords():
+ wc = WordCloud(collocations=True, stopwords={"you", "very"}, collocation_threshold=9)
+ wc.generate(STOPWORDED_COLLOCATIONS)
+
+ assert "thank you" not in wc.words_
+ assert "very much" not in wc.words_
+ assert "thank" in wc.words_
+ # a bigram of all stopwords will be removed
+ assert "you very" not in wc.words_
+
+
+def test_collocation_stopwords_uppercase():
+ wc = WordCloud(collocations=True, stopwords={"thank", "hi", "there"}, collocation_threshold=9)
+ wc.generate(STOPWORDED_COLLOCATIONS_UPPERCASE)
+
+ assert "Thank you" not in wc.words_
+ assert "thank you" not in wc.words_
+ assert "Thank" not in wc.words_
+ # a bigram of all stopwords will be removed
+ assert "hi There" not in wc.words_
+ assert "Hi there" not in wc.words_
+ assert "Hi There" not in wc.words_
+
+
def test_plurals_numbers():
text = THIS + "\n" + "1 idea 2 ideas three ideas although many Ideas"
wc = WordCloud(stopwords=[]).generate(text)
@@ -154,6 +201,13 @@ def test_check_errors():
assert "call generate" in str(e)
+def test_svg_syntax():
+ wc = WordCloud()
+ wc.generate(THIS)
+ svg = wc.to_svg()
+ ET.fromstring(svg)
+
+
def test_recolor():
wc = WordCloud(max_words=50, colormap="jet")
wc.generate(THIS)
@@ -343,7 +397,9 @@ def test_recolor_too_small_set_default():
def test_small_canvas():
# check font size fallback works on small canvas
- WordCloud(max_words=50, width=20, height=20).generate(THIS)
+ wc = WordCloud(max_words=50, width=21, height=21)
+ wc.generate(SMALL_CANVAS)
+ assert len(wc.layout_) > 0
def test_tiny_canvas():
@@ -351,6 +407,7 @@ def test_tiny_canvas():
w = WordCloud(max_words=50, width=1, height=1)
with pytest.raises(ValueError, match="Couldn't find space to draw"):
w.generate(THIS)
+ assert len(w.layout_) == 0
def test_coloring_black_works():
@@ -396,3 +453,15 @@ def test_zero_frequencies():
word_cloud.generate_from_frequencies({'test': 1, 'test1': 0, 'test2': 0})
assert len(word_cloud.layout_) == 1
assert word_cloud.layout_[0][0][0] == 'test'
+
+
+def test_plural_stopwords():
+ x = '''was was was was was was was was was was was was was was was
+ wa
+ hello hello hello hello hello hello hello hello
+ goodbye good bye maybe yes no'''
+ w = WordCloud().generate(x)
+ assert w.words_['wa'] < 1
+
+ w = WordCloud(collocations=False).generate(x)
+ assert w.words_['wa'] < 1
=====================================
test/test_wordcloud_cli.py
=====================================
@@ -1,6 +1,7 @@
import argparse
import os
import subprocess
+import sys
from collections import namedtuple
import wordcloud as wc
@@ -25,7 +26,10 @@ ARGUMENT_SPEC_TYPED = [
ArgOption(cli_name='relative_scaling', init_name='relative_scaling', pass_value=1, fail_value='c'),
]
ARGUMENT_SPEC_UNARY = [
- ArgOption(cli_name='no_collocations', init_name='collocations', pass_value=True, fail_value=1)
+ ArgOption(cli_name='no_collocations', init_name='collocations', pass_value=True, fail_value=1),
+ ArgOption(cli_name='include_numbers', init_name='include_numbers', pass_value=True, fail_value=2),
+ ArgOption(cli_name='no_normalize_plurals', init_name='normalize_plurals', pass_value=True, fail_value=3),
+ ArgOption(cli_name='repeat', init_name='repeat', pass_value=True, fail_value=4),
]
ARGUMENT_SPEC_REMAINING = [
ArgOption(cli_name='stopwords', init_name='stopwords', pass_value=PassFile(), fail_value=None),
@@ -36,8 +40,16 @@ ARGUMENT_SPEC_REMAINING = [
ArgOption(cli_name='background', init_name='background_color', pass_value='grey', fail_value=None),
ArgOption(cli_name='contour_color', init_name='contour_color', pass_value='grey', fail_value=None),
ArgOption(cli_name='contour_width', init_name='contour_width', pass_value=0.5, fail_value='blue'),
- ArgOption(cli_name='include_numbers', init_name='include_numbers', pass_value=True, fail_value=None),
ArgOption(cli_name='min_word_length', init_name='min_word_length', pass_value=5, fail_value='blue'),
+ ArgOption(cli_name='prefer_horizontal', init_name='prefer_horizontal', pass_value=.1, fail_value='blue'),
+ ArgOption(cli_name='scale', init_name='scale', pass_value=1., fail_value='blue'),
+ ArgOption(cli_name='colormap', init_name='colormap', pass_value='Greens', fail_value=1),
+ ArgOption(cli_name='mode', init_name='mode', pass_value='RGBA', fail_value=2),
+ ArgOption(cli_name='max_words', init_name='max_words', pass_value=10, fail_value='blue'),
+ ArgOption(cli_name='min_font_size', init_name='min_font_size', pass_value=10, fail_value='blue'),
+ ArgOption(cli_name='max_font_size', init_name='max_font_size', pass_value=10, fail_value='blue'),
+ ArgOption(cli_name='font_step', init_name='font_step', pass_value=10, fail_value='blue'),
+ ArgOption(cli_name='random_state', init_name='random_state', pass_value=100, fail_value='blue'),
]
ARGUMENT_CLI_NAMES_UNARY = [arg_opt.cli_name for arg_opt in ARGUMENT_SPEC_UNARY]
@@ -129,7 +141,7 @@ def test_unicode_with_stopwords():
assert u'\u304D' in args['stopwords']
-def test_cli_writes_image(tmpdir, tmp_text_file):
+def test_cli_writes_to_imagefile(tmpdir, tmp_text_file):
# ensure writing works with all python versions
tmp_image_file = tmpdir.join("word_cloud.png")
@@ -138,7 +150,26 @@ def test_cli_writes_image(tmpdir, tmp_text_file):
args, text, image_file = cli.parse_args(['--text', str(tmp_text_file), '--imagefile', str(tmp_image_file)])
cli.main(args, text, image_file)
- # expecting image to be written
+ # expecting image to be written to imagefile
+ assert tmp_image_file.size() > 0
+
+
+# capsysbinary should be used here, but it's not supported in python 2.
+def test_cli_writes_to_stdout(tmpdir, tmp_text_file):
+ # ensure writing works with all python versions
+ tmp_image_file = tmpdir.join("word_cloud.png")
+
+ tmp_text_file.write(b'some text')
+
+ originalBuffer = sys.stdout.buffer
+ sys.stdout.buffer = tmp_image_file.open('wb+')
+
+ args, text, image_file = cli.parse_args(['--text', str(tmp_text_file)])
+ cli.main(args, text, image_file)
+
+ sys.stdout.buffer = originalBuffer
+
+ # expecting image to be written to stdout
assert tmp_image_file.size() > 0
=====================================
versioneer.py
=====================================
@@ -418,7 +418,7 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False,
return stdout, p.returncode
-LONG_VERSION_PY['git'] = '''
+LONG_VERSION_PY['git'] = r'''
# This file helps to compute a version number in source trees obtained from
# git-archive tarball (such as those provided by githubs download-from-tag
# feature). Distribution tarballs (built by setup.py sdist) and build
=====================================
wordcloud/_version.py
=====================================
@@ -23,9 +23,9 @@ def get_keywords():
# setup.py/versioneer.py will grep for the variable names, so they must
# each be defined on a line of their own. _version.py will just call
# get_keywords().
- git_refnames = " (tag: 1.6.0)"
- git_full = "378a920fd382e38101739331a6db4857c5f1a10b"
- git_date = "2019-11-22 17:04:28 -0500"
+ git_refnames = " (tag: 1.8.0)"
+ git_full = "2280d32872720a4e107de3e78ee39b1e172dc242"
+ git_date = "2020-08-14 17:41:09 -0400"
keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
return keywords
=====================================
wordcloud/tokenization.py
=====================================
@@ -5,7 +5,7 @@ from collections import defaultdict
from math import log
-def l(k, n, x): # noqa: E743
+def l(k, n, x): # noqa: E741, E743
# dunning's likelihood ratio with notation from
# http://nlp.stanford.edu/fsnlp/promo/colloc.pdf p162
return log(max(x, 1e-10)) * k + log(max(1 - x, 1e-10)) * (n - k)
@@ -36,26 +36,29 @@ def pairwise(iterable):
return zip(a, b)
-def unigrams_and_bigrams(words, normalize_plurals=True):
- n_words = len(words)
- # make tuples of two words following each other
- bigrams = list(pairwise(words))
+def unigrams_and_bigrams(words, stopwords, normalize_plurals=True, collocation_threshold=30):
+ # We must create the bigrams before removing the stopword tokens from the words, or else we get bigrams like
+ # "thank much" from "thank you very much".
+ # We don't allow any of the words in the bigram to be stopwords
+ bigrams = list(p for p in pairwise(words) if not any(w.lower() in stopwords for w in p))
+ unigrams = list(w for w in words if w.lower() not in stopwords)
+ n_words = len(unigrams)
counts_unigrams, standard_form = process_tokens(
- words, normalize_plurals=normalize_plurals)
+ unigrams, normalize_plurals=normalize_plurals)
counts_bigrams, standard_form_bigrams = process_tokens(
[" ".join(bigram) for bigram in bigrams],
normalize_plurals=normalize_plurals)
# create a copy of counts_unigram so the score computation is not changed
- counts = counts_unigrams.copy()
+ orig_counts = counts_unigrams.copy()
- # decount words inside bigrams
+ # Include bigrams that are also collocations
for bigram_string, count in counts_bigrams.items():
bigram = tuple(bigram_string.split(" "))
- # collocation detection (30 is arbitrary):
word1 = standard_form[bigram[0].lower()]
word2 = standard_form[bigram[1].lower()]
- if score(count, counts[word1], counts[word2], n_words) > 30:
+ collocation_score = score(count, orig_counts[word1], orig_counts[word2], n_words)
+ if collocation_score > collocation_threshold:
# bigram is a collocation
# discount words in unigrams dict. hack because one word might
# appear in multiple collocations at the same time
@@ -63,10 +66,8 @@ def unigrams_and_bigrams(words, normalize_plurals=True):
counts_unigrams[word1] -= counts_bigrams[bigram_string]
counts_unigrams[word2] -= counts_bigrams[bigram_string]
counts_unigrams[bigram_string] = counts_bigrams[bigram_string]
- words = list(counts_unigrams.keys())
- for word in words:
- # remove empty / negative counts
- if counts_unigrams[word] <= 0:
+ for word, count in list(counts_unigrams.items()):
+ if count <= 0:
del counts_unigrams[word]
return counts_unigrams
=====================================
wordcloud/wordcloud.py
=====================================
@@ -1,3 +1,4 @@
+# coding=utf-8
# Author: Andreas Christian Mueller <t3kcit at gmail.com>
#
# (c) 2012
@@ -9,13 +10,16 @@ from __future__ import division
import warnings
from random import Random
+import io
import os
import re
+import base64
import sys
import colorsys
import matplotlib
import numpy as np
from operator import itemgetter
+from xml.sax import saxutils
from PIL import Image
from PIL import ImageColor
@@ -270,6 +274,14 @@ class WordCloud(object):
min_word_length : int, default=0
Minimum number of letters a word must have to be included.
+ collocation_threshold: int, default=30
+ Bigrams must have a Dunning likelihood collocation score greater than this
+ parameter to be counted as bigrams. Default of 30 is arbitrary.
+
+ See Manning, C.D., Manning, C.D. and Schütze, H., 1999. Foundations of
+ Statistical Natural Language Processing. MIT press, p. 162
+ https://nlp.stanford.edu/fsnlp/promo/colloc.pdf#page=22
+
Attributes
----------
``words_`` : dict of string to float
@@ -300,7 +312,7 @@ class WordCloud(object):
relative_scaling='auto', regexp=None, collocations=True,
colormap=None, normalize_plurals=True, contour_width=0,
contour_color='black', repeat=False,
- include_numbers=False, min_word_length=0):
+ include_numbers=False, min_word_length=0, collocation_threshold=30):
if font_path is None:
font_path = FONT_PATH
if color_func is None and colormap is None:
@@ -351,6 +363,7 @@ class WordCloud(object):
self.repeat = repeat
self.include_numbers = include_numbers
self.min_word_length = min_word_length
+ self.collocation_threshold = collocation_threshold
def fit_words(self, frequencies):
"""Create a word_cloud from words and frequencies.
@@ -554,15 +567,11 @@ class WordCloud(object):
include all those things.
"""
- stopwords = set([i.lower() for i in self.stopwords])
-
flags = (re.UNICODE if sys.version < '3' and type(text) is unicode # noqa: F821
else 0)
regexp = self.regexp if self.regexp is not None else r"\w[\w']+"
words = re.findall(regexp, text, flags)
- # remove stopwords
- words = [word for word in words if word.lower() not in stopwords]
# remove 's
words = [word[:-2] if word.lower().endswith("'s") else word
for word in words]
@@ -573,9 +582,12 @@ class WordCloud(object):
if self.min_word_length:
words = [word for word in words if len(word) >= self.min_word_length]
+ stopwords = set([i.lower() for i in self.stopwords])
if self.collocations:
- word_counts = unigrams_and_bigrams(words, self.normalize_plurals)
+ word_counts = unigrams_and_bigrams(words, stopwords, self.normalize_plurals, self.collocation_threshold)
else:
+ # remove stopwords
+ words = [word for word in words if word.lower() not in stopwords]
word_counts, _ = process_tokens(words, self.normalize_plurals)
return word_counts
@@ -729,6 +741,254 @@ class WordCloud(object):
def to_html(self):
raise NotImplementedError("FIXME!!!")
+ def to_svg(self, embed_font=False, optimize_embedded_font=True, embed_image=False):
+ """Export to SVG.
+
+ Font is assumed to be available to the SVG reader. Otherwise, text
+ coordinates may produce artifacts when rendered with replacement font.
+ It is also possible to include a subset of the original font in WOFF
+ format using ``embed_font`` (requires `fontTools`).
+
+ Note that some renderers do not handle glyphs the same way, and may
+ differ from ``to_image`` result. In particular, Complex Text Layout may
+ not be supported. In this typesetting, the shape or positioning of a
+ grapheme depends on its relation to other graphemes.
+
+ Pillow, since version 4.2.0, supports CTL using ``libraqm``. However,
+ due to dependencies, this feature is not always enabled. Hence, the
+ same rendering differences may appear in ``to_image``. As this
+ rasterized output is used to compute the layout, this also affects the
+ layout generation. Use ``PIL.features.check`` to test availability of
+ ``raqm``.
+
+    Consistent rendering is therefore expected if both Pillow and the SVG
+ renderer have the same support of CTL.
+
+ Contour drawing is not supported.
+
+ Parameters
+ ----------
+ embed_font : bool, default=False
+ Whether to include font inside resulting SVG file.
+
+ optimize_embedded_font : bool, default=True
+ Whether to be aggressive when embedding a font, to reduce size. In
+ particular, hinting tables are dropped, which may introduce slight
+ changes to character shapes (w.r.t. `to_image` baseline).
+
+ embed_image : bool, default=False
+ Whether to include rasterized image inside resulting SVG file.
+ Useful for debugging.
+
+ Returns
+ -------
+ content : string
+ Word cloud image as SVG string
+ """
+
+ # TODO should add option to specify URL for font (i.e. WOFF file)
+
+ # Make sure layout is generated
+ self._check_generated()
+
+ # Get output size, in pixels
+ if self.mask is not None:
+ width = self.mask.shape[1]
+ height = self.mask.shape[0]
+ else:
+ height, width = self.height, self.width
+
+ # Get max font size
+ if self.max_font_size is None:
+ max_font_size = max(w[1] for w in self.layout_)
+ else:
+ max_font_size = self.max_font_size
+
+ # Text buffer
+ result = []
+
+ # Get font information
+ font = ImageFont.truetype(self.font_path, int(max_font_size * self.scale))
+ raw_font_family, raw_font_style = font.getname()
+ # TODO properly escape/quote this name?
+ font_family = repr(raw_font_family)
+ # TODO better support for uncommon font styles/weights?
+ raw_font_style = raw_font_style.lower()
+ if 'bold' in raw_font_style:
+ font_weight = 'bold'
+ else:
+ font_weight = 'normal'
+ if 'italic' in raw_font_style:
+ font_style = 'italic'
+ elif 'oblique' in raw_font_style:
+ font_style = 'oblique'
+ else:
+ font_style = 'normal'
+
+ # Add header
+ result.append(
+ '<svg'
+ ' xmlns="http://www.w3.org/2000/svg"'
+ ' width="{}"'
+ ' height="{}"'
+ '>'
+ .format(
+ width * self.scale,
+ height * self.scale
+ )
+ )
+
+ # Embed font, if requested
+ if embed_font:
+
+ # Import here, to avoid hard dependency on fonttools
+ import fontTools
+ import fontTools.subset
+
+ # Subset options
+ options = fontTools.subset.Options(
+
+ # Small impact on character shapes, but reduce size a lot
+ hinting=not optimize_embedded_font,
+
+ # On small subsets, can improve size
+ desubroutinize=optimize_embedded_font,
+
+ # Try to be lenient
+ ignore_missing_glyphs=True,
+ )
+
+ # Load and subset font
+ ttf = fontTools.subset.load_font(self.font_path, options)
+ subsetter = fontTools.subset.Subsetter(options)
+ characters = {c for item in self.layout_ for c in item[0][0]}
+ text = ''.join(characters)
+ subsetter.populate(text=text)
+ subsetter.subset(ttf)
+
+ # Export as WOFF
+ # TODO is there a better method, i.e. directly export to WOFF?
+ buffer = io.BytesIO()
+ ttf.saveXML(buffer)
+ buffer.seek(0)
+ woff = fontTools.ttLib.TTFont(flavor='woff')
+ woff.importXML(buffer)
+
+ # Create stylesheet with embedded font face
+ buffer = io.BytesIO()
+ woff.save(buffer)
+ data = base64.b64encode(buffer.getbuffer()).decode('ascii')
+ url = 'data:application/font-woff;charset=utf-8;base64,' + data
+ result.append(
+ '<style>'
+ '@font-face{{'
+ 'font-family:{};'
+ 'font-weight:{};'
+ 'font-style:{};'
+ 'src:url("{}")format("woff");'
+ '}}'
+ '</style>'
+ .format(
+ font_family,
+ font_weight,
+ font_style,
+ url
+ )
+ )
+
+ # Select global style
+ result.append(
+ '<style>'
+ 'text{{'
+ 'font-family:{};'
+ 'font-weight:{};'
+ 'font-style:{};'
+ '}}'
+ '</style>'
+ .format(
+ font_family,
+ font_weight,
+ font_style
+ )
+ )
+
+ # Add background
+ if self.background_color is not None:
+ result.append(
+ '<rect'
+ ' width="100%"'
+ ' height="100%"'
+ ' style="fill:{}"'
+ '>'
+ '</rect>'
+ .format(self.background_color)
+ )
+
+ # Embed image, useful for debug purpose
+ if embed_image:
+ image = self.to_image()
+ data = io.BytesIO()
+ image.save(data, format='JPEG')
+ data = base64.b64encode(data.getbuffer()).decode('ascii')
+ result.append(
+ '<image'
+ ' width="100%"'
+ ' height="100%"'
+ ' href="data:image/jpg;base64,{}"'
+ '/>'
+ .format(data)
+ )
+
+ # For each word in layout
+ for (word, count), font_size, (y, x), orientation, color in self.layout_:
+ x *= self.scale
+ y *= self.scale
+
+ # Get text metrics
+ font = ImageFont.truetype(self.font_path, int(font_size * self.scale))
+ (size_x, size_y), (offset_x, offset_y) = font.font.getsize(word)
+ ascent, descent = font.getmetrics()
+
+ # Compute text bounding box
+ min_x = -offset_x
+ max_x = size_x - offset_x
+ max_y = ascent - offset_y
+
+ # Compute text attributes
+ attributes = {}
+ if orientation == Image.ROTATE_90:
+ x += max_y
+ y += max_x - min_x
+ transform = 'translate({},{}) rotate(-90)'.format(x, y)
+ else:
+ x += min_x
+ y += max_y
+ transform = 'translate({},{})'.format(x, y)
+
+ # Create node
+ attributes = ' '.join('{}="{}"'.format(k, v) for k, v in attributes.items())
+ result.append(
+ '<text'
+ ' transform="{}"'
+ ' font-size="{}"'
+ ' style="fill:{}"'
+ '>'
+ '{}'
+ '</text>'
+ .format(
+ transform,
+ font_size * self.scale,
+ color,
+ saxutils.escape(word)
+ )
+ )
+
+ # TODO draw contour
+
+ # Complete SVG file
+ result.append('</svg>')
+ return '\n'.join(result)
+
def _get_bolean_mask(self, mask):
"""Cast to two dimensional boolean mask."""
if mask.dtype.kind == 'f':
=====================================
wordcloud/wordcloud_cli.py
=====================================
@@ -54,14 +54,15 @@ class FileType(object):
if 'r' in self._mode:
return sys.stdin
elif 'w' in self._mode:
- return sys.stdout
+ return sys.stdout.buffer if 'b' in self._mode else sys.stdout
else:
msg = 'argument "-" with mode %r' % self._mode
raise ValueError(msg)
# all other arguments are used as file names
try:
- return io.open(string, self._mode, self._bufsize, encoding="UTF-8")
+ encoding = None if 'b' in self._mode else "UTF-8"
+ return io.open(string, self._mode, self._bufsize, encoding=encoding)
except IOError as e:
message = "can't open '%s': %s"
raise argparse.ArgumentTypeError(message % (string, e))
@@ -107,7 +108,7 @@ def make_parser():
help='specify file of stopwords (containing one word per line)'
' to remove from the given text after parsing')
parser.add_argument(
- '--imagefile', metavar='file', type=argparse.FileType('wb'),
+ '--imagefile', metavar='file', type=FileType('wb'),
default='-',
help='file the completed PNG image should be written to'
' (default: stdout)')
@@ -156,9 +157,7 @@ def make_parser():
'(default: add unigrams and bigrams)')
parser.add_argument(
'--include_numbers',
- type=bool,
- default=False,
- metavar='include_numbers',
+ action='store_true',
dest='include_numbers',
help='include numbers in wordcloud?')
parser.add_argument(
@@ -168,6 +167,52 @@ def make_parser():
metavar='min_word_length',
dest='min_word_length',
help='only include words with more than X letters')
+ parser.add_argument(
+ '--prefer_horizontal',
+ type=float, default=.9, metavar='ratio',
+ help='ratio of times to try horizontal fitting as opposed to vertical')
+ parser.add_argument(
+ '--scale',
+ type=float, default=1, metavar='scale',
+ help='scaling between computation and drawing')
+ parser.add_argument(
+ '--colormap',
+ type=str, default='viridis', metavar='map',
+ help='matplotlib colormap name')
+ parser.add_argument(
+ '--mode',
+ type=str, default='RGB', metavar='mode',
+ help='use RGB or RGBA for transparent background')
+ parser.add_argument(
+ '--max_words',
+ type=int, default=200, metavar='N',
+ help='maximum number of words')
+ parser.add_argument(
+ '--min_font_size',
+ type=int, default=4, metavar='size',
+ help='smallest font size to use')
+ parser.add_argument(
+ '--max_font_size',
+ type=int, default=None, metavar='size',
+ help='maximum font size for the largest word')
+ parser.add_argument(
+ '--font_step',
+ type=int, default=1, metavar='step',
+ help='step size for the font')
+ parser.add_argument(
+ '--random_state',
+ type=int, default=None, metavar='seed',
+ help='random seed')
+ parser.add_argument(
+ '--no_normalize_plurals',
+ action='store_false',
+ dest='normalize_plurals',
+ help='whether to remove trailing \'s\' from words')
+ parser.add_argument(
+ '--repeat',
+ action='store_true',
+ dest='repeat',
+ help='whether to repeat words and phrases')
parser.add_argument(
'--version', action='version',
version='%(prog)s {version}'.format(version=__version__))
View it on GitLab: https://salsa.debian.org/med-team/python-wordcloud/-/commit/80d06ac24cc1235b9964c3df138e17008765d5f3
--
View it on GitLab: https://salsa.debian.org/med-team/python-wordcloud/-/commit/80d06ac24cc1235b9964c3df138e17008765d5f3
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20200927/7a9362ba/attachment-0001.html>
More information about the debian-med-commit
mailing list