[med-svn] [Git][med-team/python-wordcloud][upstream] New upstream version 1.8.0+dfsg
Nilesh Patra
gitlab at salsa.debian.org
Sun Sep 27 11:15:48 BST 2020
Nilesh Patra pushed to branch upstream at Debian Med / python-wordcloud
Commits:
80d06ac2 by Nilesh Patra at 2020-09-27T15:14:09+05:30
New upstream version 1.8.0+dfsg
- - - - -
14 changed files:
- .circleci/config.yml
- .travis.yml
- + CONTRIBUTING.md
- README.md
- appveyor.yml
- requirements-dev.txt
- setup.cfg
- test/test_wordcloud.py
- test/test_wordcloud_cli.py
- versioneer.py
- wordcloud/_version.py
- wordcloud/tokenization.py
- wordcloud/wordcloud.py
- wordcloud/wordcloud_cli.py
Changes:
=====================================
.circleci/config.yml
=====================================
@@ -28,12 +28,12 @@ references:
x64_build_job: &x64_build_job
docker:
- - image: dockcross/manylinux-x64
+ - image: dockcross/manylinux1-x64
<<: *ci_steps
x86_build_job: &x86_build_job
docker:
- - image: dockcross/manylinux-x86
+ - image: dockcross/manylinux1-x86
<<: *ci_steps
deploy_website_command: &deploy_website_command
@@ -55,28 +55,28 @@ jobs:
<<: *x64_build_job
manylinux-x64_cp27-cp27mu:
<<: *x64_build_job
- manylinux-x64_cp34-cp34m:
- <<: *x64_build_job
manylinux-x64_cp35-cp35m:
<<: *x64_build_job
manylinux-x64_cp36-cp36m_upload-sdist:
<<: *x64_build_job
manylinux-x64_cp37-cp37m:
<<: *x64_build_job
+ manylinux-x64_cp38-cp38:
+ <<: *x64_build_job
# x86
#manylinux-x86_cp27-cp27m:
# <<: *x86_build_job
#manylinux-x86_cp27-cp27mu:
# <<: *x86_build_job
- #manylinux-x86_cp34-cp34m:
- # <<: *x86_build_job
#manylinux-x86_cp35-cp35m:
# <<: *x86_build_job
#manylinux-x86_cp36-cp36m:
# <<: *x86_build_job
#manylinux-x86_cp37-cp37m:
# <<: *x86_build_job
+ #manylinux-x86_cp38-cp38:
+ # <<: *x86_build_job
build-website_cp37-cp37m:
docker:
@@ -141,27 +141,27 @@ workflows:
<<: *no_filters
- manylinux-x64_cp27-cp27mu:
<<: *no_filters
- - manylinux-x64_cp34-cp34m:
- <<: *no_filters
- manylinux-x64_cp35-cp35m:
<<: *no_filters
- manylinux-x64_cp36-cp36m_upload-sdist:
<<: *no_filters
- manylinux-x64_cp37-cp37m:
<<: *no_filters
+ - manylinux-x64_cp38-cp38:
+ <<: *no_filters
# x86
#- manylinux-x86_cp27-cp27m:
# <<: *no_filters
#- manylinux-x86_cp27-cp27mu:
# <<: *no_filters
- #- manylinux-x86_cp34-cp34m:
- # <<: *no_filters
#- manylinux-x86_cp35-cp35m:
# <<: *no_filters
#- manylinux-x86_cp36-cp36m:
# <<: *no_filters
#- manylinux-x86_cp37-cp37m:
# <<: *no_filters
+ #- manylinux-x86_cp38-cp38:
+ # <<: *no_filters
- build-website_cp37-cp37m:
requires:
@@ -173,17 +173,17 @@ workflows:
# x64
- manylinux-x64_cp27-cp27m
- manylinux-x64_cp27-cp27mu
- - manylinux-x64_cp34-cp34m
- manylinux-x64_cp35-cp35m
- manylinux-x64_cp36-cp36m_upload-sdist
- manylinux-x64_cp37-cp37m
+ - manylinux-x64_cp38-cp38
# x86
#- manylinux-x86_cp27-cp27m
#- manylinux-x86_cp27-cp27mu
- #- manylinux-x86_cp34-cp34m
#- manylinux-x86_cp35-cp35m
#- manylinux-x86_cp36-cp36m
#- manylinux-x86_cp37-cp37m
+ #- manylinux-x86_cp38-cp38
# misc
- build-website_cp37-cp37m
filters:
@@ -194,17 +194,17 @@ workflows:
# x64
- manylinux-x64_cp27-cp27m
- manylinux-x64_cp27-cp27mu
- - manylinux-x64_cp34-cp34m
- manylinux-x64_cp35-cp35m
- manylinux-x64_cp36-cp36m_upload-sdist
- manylinux-x64_cp37-cp37m
+ - manylinux-x64_cp38-cp38
# x86
#- manylinux-x86_cp27-cp27m
#- manylinux-x86_cp27-cp27mu
- #- manylinux-x86_cp34-cp34m
#- manylinux-x86_cp35-cp35m
#- manylinux-x86_cp36-cp36m
#- manylinux-x86_cp37-cp37m
+ #- manylinux-x86_cp38-cp38
# misc
- build-website_cp37-cp37m
filters:
=====================================
.travis.yml
=====================================
@@ -11,22 +11,22 @@ matrix:
- os: osx
language: generic
env:
- - PYTHON_VERSION=3.7.0
+ - PYTHON_VERSION=3.8.0
- os: osx
language: generic
env:
- - PYTHON_VERSION=3.6.5
+ - PYTHON_VERSION=3.7.0
- os: osx
language: generic
env:
- - PYTHON_VERSION=3.5.5
+ - PYTHON_VERSION=3.6.5
- os: osx
language: generic
env:
- - PYTHON_VERSION=3.4.8
+ - PYTHON_VERSION=3.5.5
- os: osx
language: generic
@@ -35,10 +35,10 @@ matrix:
cache:
directories:
+ - $HOME/.pyenv/versions/3.8.0
- $HOME/.pyenv/versions/3.7.0
- $HOME/.pyenv/versions/3.6.5
- $HOME/.pyenv/versions/3.5.5
- - $HOME/.pyenv/versions/3.4.8
- $HOME/.pyenv/versions/2.7.15
- $HOME/downloads
=====================================
CONTRIBUTING.md
=====================================
@@ -0,0 +1,46 @@
+# Contributing
+
+To contribute to wordcloud, you'll need to follow the instructions in
+[Creating a pull request from a fork](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request-from-a-fork).
+
+In addition to the general procedure for creating a pull request, please follow
+the following steps:
+
+## Before starting development
+
+### Use a correct version of Python
+
+Python 3.7.x should be fine for development.
+
+```
+python --version
+> Python 3.7.6
+```
+
+### Install all dependencies
+
+```
+pip install -U -r requirements.txt -r requirements-dev.txt
+```
+
+### Ensure that files are correctly formatted
+
+```
+flake8
+```
+
+### Ensure that tests pass
+
+```
+pip install -e .
+pytest
+```
+
+## Before creating a pull request
+
+### Confirm formatting and test passage
+
+```
+flake8
+pytest
+```
=====================================
README.md
=====================================
@@ -57,6 +57,10 @@ Or run [examples/masked.py][masked] to see more options. A sample output is:
Getting fancy with some colors:
![Parrot with rainbow colors](examples/parrot_new.png)
+Generating wordclouds for Arabic:
+
+![Arabic wordcloud](examples/arabic_example.png)
+
## Command-line usage
=====================================
appveyor.yml
=====================================
@@ -8,56 +8,56 @@ version: "0.0.1.{build}"
environment:
matrix:
-# - PYTHON_DIR: "C:\\Python27"
-# PYTHON_VERSION: "2.7.x"
-# PYTHON_ARCH: "32"
-# BLOCK: "0"
+ - PYTHON_DIR: "C:\\Python27"
+ PYTHON_VERSION: "2.7.x"
+ PYTHON_ARCH: "32"
+ BLOCK: "0"
- PYTHON_DIR: "C:\\Python27-x64"
PYTHON_VERSION: "2.7.x"
PYTHON_ARCH: "64"
BLOCK: "0"
-# - PYTHON_DIR: "C:\\Python34"
-# PYTHON_VERSION: "3.4.x"
-# PYTHON_ARCH: "32"
-# BLOCK: "0"
-
- - PYTHON_DIR: "C:\\Python34-x64"
- PYTHON_VERSION: "3.4.x"
- PYTHON_ARCH: "64"
+ - PYTHON_DIR: "C:\\Python35"
+ PYTHON_VERSION: "3.5.x"
+ PYTHON_ARCH: "32"
BLOCK: "0"
-# - PYTHON_DIR: "C:\\Python35"
-# PYTHON_VERSION: "3.5.x"
-# PYTHON_ARCH: "32"
-# BLOCK: "0"
-
- PYTHON_DIR: "C:\\Python35-x64"
PYTHON_VERSION: "3.5.x"
PYTHON_ARCH: "64"
BLOCK: "0"
-# - PYTHON_DIR: "C:\\Python36"
-# PYTHON_VERSION: "3.6.x"
-# PYTHON_ARCH: "32"
-# BLOCK: "0"
+ - PYTHON_DIR: "C:\\Python36"
+ PYTHON_VERSION: "3.6.x"
+ PYTHON_ARCH: "32"
+ BLOCK: "0"
- PYTHON_DIR: "C:\\Python36-x64"
PYTHON_VERSION: "3.6.x"
PYTHON_ARCH: "64"
BLOCK: "0"
-# - PYTHON_DIR: "C:\\Python37"
-# PYTHON_VERSION: "3.7.x"
-# PYTHON_ARCH: "32"
-# BLOCK: "0"
-#
+ - PYTHON_DIR: "C:\\Python37"
+ PYTHON_VERSION: "3.7.x"
+ PYTHON_ARCH: "32"
+ BLOCK: "0"
+
- PYTHON_DIR: "C:\\Python37-x64"
PYTHON_VERSION: "3.7.x"
PYTHON_ARCH: "64"
BLOCK: "0"
+ - PYTHON_DIR: "C:\\Python38"
+ PYTHON_VERSION: "3.8.x"
+ PYTHON_ARCH: "32"
+ BLOCK: "0"
+
+ - PYTHON_DIR: "C:\\Python38-x64"
+ PYTHON_VERSION: "3.8.x"
+ PYTHON_ARCH: "64"
+ BLOCK: "0"
+
PYPI_USER:
secure: deKM8MvS0hJbOBqZmBym0Q==
PYPI_PASSWORD:
=====================================
requirements-dev.txt
=====================================
@@ -1,10 +1,10 @@
codecov
coverage
-flake8
+flake8==3.8.0
mock
-pytest
+pytest<5.4.0
pytest-cov
pytest-sugar
setuptools>=28.0.0
twine
-wheel==0.31.1
+wheel==0.34.1
=====================================
setup.cfg
=====================================
@@ -17,7 +17,7 @@ show-source = True
# Maximum cyclomatic complexity allowed
max-complexity = 14
format = pylint
-exclude = .git,.idea,.eggs,__pycache__,doc/_build,doc/auto_examples,doc/conf.py,build,wordcloud/_version.py,versioneer.py
+exclude = .git,.idea,.eggs,__pycache__,dist,doc/_build,doc/auto_examples,doc/conf.py,build,wordcloud/_version.py,versioneer.py
[tool:pytest]
addopts = -v --cov --cov-report xml --tb=short
=====================================
test/test_wordcloud.py
=====================================
@@ -6,6 +6,7 @@ import pytest
from random import Random
from numpy.testing import assert_array_equal
from PIL import Image
+import xml.etree.ElementTree as ET
import matplotlib
matplotlib.use('Agg')
@@ -40,12 +41,34 @@ Namespaces are one honking great idea -- let's do more of those!
46 09 55 05 82 23 17 25 35 94 08 128
"""
+STOPWORDED_COLLOCATIONS = """
+thank you very much
+thank you very much
+thank you very much
+thanks
+"""
+
+STOPWORDED_COLLOCATIONS_UPPERCASE = """
+Thank you very much
+Thank you very much
+Thank you very much
+thank you very much
+hi There
+Hi there
+Hi There
+thanks
+"""
+
+SMALL_CANVAS = """
+better late than never someone will say
+"""
+
def test_collocations():
- wc = WordCloud(collocations=False, stopwords=[])
+ wc = WordCloud(collocations=False, stopwords=set())
wc.generate(THIS)
- wc2 = WordCloud(collocations=True, stopwords=[])
+ wc2 = WordCloud(collocations=True, stopwords=set())
wc2.generate(THIS)
assert "is better" in wc2.words_
@@ -53,6 +76,30 @@ def test_collocations():
assert "way may" not in wc2.words_
+def test_collocation_stopwords():
+ wc = WordCloud(collocations=True, stopwords={"you", "very"}, collocation_threshold=9)
+ wc.generate(STOPWORDED_COLLOCATIONS)
+
+ assert "thank you" not in wc.words_
+ assert "very much" not in wc.words_
+ assert "thank" in wc.words_
+ # a bigram of all stopwords will be removed
+ assert "you very" not in wc.words_
+
+
+def test_collocation_stopwords_uppercase():
+ wc = WordCloud(collocations=True, stopwords={"thank", "hi", "there"}, collocation_threshold=9)
+ wc.generate(STOPWORDED_COLLOCATIONS_UPPERCASE)
+
+ assert "Thank you" not in wc.words_
+ assert "thank you" not in wc.words_
+ assert "Thank" not in wc.words_
+ # a bigram of all stopwords will be removed
+ assert "hi There" not in wc.words_
+ assert "Hi there" not in wc.words_
+ assert "Hi There" not in wc.words_
+
+
def test_plurals_numbers():
text = THIS + "\n" + "1 idea 2 ideas three ideas although many Ideas"
wc = WordCloud(stopwords=[]).generate(text)
@@ -154,6 +201,13 @@ def test_check_errors():
assert "call generate" in str(e)
+def test_svg_syntax():
+ wc = WordCloud()
+ wc.generate(THIS)
+ svg = wc.to_svg()
+ ET.fromstring(svg)
+
+
def test_recolor():
wc = WordCloud(max_words=50, colormap="jet")
wc.generate(THIS)
@@ -343,7 +397,9 @@ def test_recolor_too_small_set_default():
def test_small_canvas():
# check font size fallback works on small canvas
- WordCloud(max_words=50, width=20, height=20).generate(THIS)
+ wc = WordCloud(max_words=50, width=21, height=21)
+ wc.generate(SMALL_CANVAS)
+ assert len(wc.layout_) > 0
def test_tiny_canvas():
@@ -351,6 +407,7 @@ def test_tiny_canvas():
w = WordCloud(max_words=50, width=1, height=1)
with pytest.raises(ValueError, match="Couldn't find space to draw"):
w.generate(THIS)
+ assert len(w.layout_) == 0
def test_coloring_black_works():
@@ -396,3 +453,15 @@ def test_zero_frequencies():
word_cloud.generate_from_frequencies({'test': 1, 'test1': 0, 'test2': 0})
assert len(word_cloud.layout_) == 1
assert word_cloud.layout_[0][0][0] == 'test'
+
+
+def test_plural_stopwords():
+ x = '''was was was was was was was was was was was was was was was
+ wa
+ hello hello hello hello hello hello hello hello
+ goodbye good bye maybe yes no'''
+ w = WordCloud().generate(x)
+ assert w.words_['wa'] < 1
+
+ w = WordCloud(collocations=False).generate(x)
+ assert w.words_['wa'] < 1
=====================================
test/test_wordcloud_cli.py
=====================================
@@ -1,6 +1,7 @@
import argparse
import os
import subprocess
+import sys
from collections import namedtuple
import wordcloud as wc
@@ -25,7 +26,10 @@ ARGUMENT_SPEC_TYPED = [
ArgOption(cli_name='relative_scaling', init_name='relative_scaling', pass_value=1, fail_value='c'),
]
ARGUMENT_SPEC_UNARY = [
- ArgOption(cli_name='no_collocations', init_name='collocations', pass_value=True, fail_value=1)
+ ArgOption(cli_name='no_collocations', init_name='collocations', pass_value=True, fail_value=1),
+ ArgOption(cli_name='include_numbers', init_name='include_numbers', pass_value=True, fail_value=2),
+ ArgOption(cli_name='no_normalize_plurals', init_name='normalize_plurals', pass_value=True, fail_value=3),
+ ArgOption(cli_name='repeat', init_name='repeat', pass_value=True, fail_value=4),
]
ARGUMENT_SPEC_REMAINING = [
ArgOption(cli_name='stopwords', init_name='stopwords', pass_value=PassFile(), fail_value=None),
@@ -36,8 +40,16 @@ ARGUMENT_SPEC_REMAINING = [
ArgOption(cli_name='background', init_name='background_color', pass_value='grey', fail_value=None),
ArgOption(cli_name='contour_color', init_name='contour_color', pass_value='grey', fail_value=None),
ArgOption(cli_name='contour_width', init_name='contour_width', pass_value=0.5, fail_value='blue'),
- ArgOption(cli_name='include_numbers', init_name='include_numbers', pass_value=True, fail_value=None),
ArgOption(cli_name='min_word_length', init_name='min_word_length', pass_value=5, fail_value='blue'),
+ ArgOption(cli_name='prefer_horizontal', init_name='prefer_horizontal', pass_value=.1, fail_value='blue'),
+ ArgOption(cli_name='scale', init_name='scale', pass_value=1., fail_value='blue'),
+ ArgOption(cli_name='colormap', init_name='colormap', pass_value='Greens', fail_value=1),
+ ArgOption(cli_name='mode', init_name='mode', pass_value='RGBA', fail_value=2),
+ ArgOption(cli_name='max_words', init_name='max_words', pass_value=10, fail_value='blue'),
+ ArgOption(cli_name='min_font_size', init_name='min_font_size', pass_value=10, fail_value='blue'),
+ ArgOption(cli_name='max_font_size', init_name='max_font_size', pass_value=10, fail_value='blue'),
+ ArgOption(cli_name='font_step', init_name='font_step', pass_value=10, fail_value='blue'),
+ ArgOption(cli_name='random_state', init_name='random_state', pass_value=100, fail_value='blue'),
]
ARGUMENT_CLI_NAMES_UNARY = [arg_opt.cli_name for arg_opt in ARGUMENT_SPEC_UNARY]
@@ -129,7 +141,7 @@ def test_unicode_with_stopwords():
assert u'\u304D' in args['stopwords']
-def test_cli_writes_image(tmpdir, tmp_text_file):
+def test_cli_writes_to_imagefile(tmpdir, tmp_text_file):
# ensure writing works with all python versions
tmp_image_file = tmpdir.join("word_cloud.png")
@@ -138,7 +150,26 @@ def test_cli_writes_image(tmpdir, tmp_text_file):
args, text, image_file = cli.parse_args(['--text', str(tmp_text_file), '--imagefile', str(tmp_image_file)])
cli.main(args, text, image_file)
- # expecting image to be written
+ # expecting image to be written to imagefile
+ assert tmp_image_file.size() > 0
+
+
+# capsysbinary should be used here, but it's not supported in python 2.
+def test_cli_writes_to_stdout(tmpdir, tmp_text_file):
+ # ensure writing works with all python versions
+ tmp_image_file = tmpdir.join("word_cloud.png")
+
+ tmp_text_file.write(b'some text')
+
+ originalBuffer = sys.stdout.buffer
+ sys.stdout.buffer = tmp_image_file.open('wb+')
+
+ args, text, image_file = cli.parse_args(['--text', str(tmp_text_file)])
+ cli.main(args, text, image_file)
+
+ sys.stdout.buffer = originalBuffer
+
+ # expecting image to be written to stdout
assert tmp_image_file.size() > 0
=====================================
versioneer.py
=====================================
@@ -418,7 +418,7 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False,
return stdout, p.returncode
-LONG_VERSION_PY['git'] = '''
+LONG_VERSION_PY['git'] = r'''
# This file helps to compute a version number in source trees obtained from
# git-archive tarball (such as those provided by githubs download-from-tag
# feature). Distribution tarballs (built by setup.py sdist) and build
=====================================
wordcloud/_version.py
=====================================
@@ -23,9 +23,9 @@ def get_keywords():
# setup.py/versioneer.py will grep for the variable names, so they must
# each be defined on a line of their own. _version.py will just call
# get_keywords().
- git_refnames = " (tag: 1.6.0)"
- git_full = "378a920fd382e38101739331a6db4857c5f1a10b"
- git_date = "2019-11-22 17:04:28 -0500"
+ git_refnames = " (tag: 1.8.0)"
+ git_full = "2280d32872720a4e107de3e78ee39b1e172dc242"
+ git_date = "2020-08-14 17:41:09 -0400"
keywords = {"refnames": git_refnames, "full": git_full, "date": git_date}
return keywords
=====================================
wordcloud/tokenization.py
=====================================
@@ -5,7 +5,7 @@ from collections import defaultdict
from math import log
-def l(k, n, x): # noqa: E743
+def l(k, n, x): # noqa: E741, E743
# dunning's likelihood ratio with notation from
# http://nlp.stanford.edu/fsnlp/promo/colloc.pdf p162
return log(max(x, 1e-10)) * k + log(max(1 - x, 1e-10)) * (n - k)
@@ -36,26 +36,29 @@ def pairwise(iterable):
return zip(a, b)
-def unigrams_and_bigrams(words, normalize_plurals=True):
- n_words = len(words)
- # make tuples of two words following each other
- bigrams = list(pairwise(words))
+def unigrams_and_bigrams(words, stopwords, normalize_plurals=True, collocation_threshold=30):
+ # We must create the bigrams before removing the stopword tokens from the words, or else we get bigrams like
+ # "thank much" from "thank you very much".
+ # We don't allow any of the words in the bigram to be stopwords
+ bigrams = list(p for p in pairwise(words) if not any(w.lower() in stopwords for w in p))
+ unigrams = list(w for w in words if w.lower() not in stopwords)
+ n_words = len(unigrams)
counts_unigrams, standard_form = process_tokens(
- words, normalize_plurals=normalize_plurals)
+ unigrams, normalize_plurals=normalize_plurals)
counts_bigrams, standard_form_bigrams = process_tokens(
[" ".join(bigram) for bigram in bigrams],
normalize_plurals=normalize_plurals)
# create a copy of counts_unigram so the score computation is not changed
- counts = counts_unigrams.copy()
+ orig_counts = counts_unigrams.copy()
- # decount words inside bigrams
+ # Include bigrams that are also collocations
for bigram_string, count in counts_bigrams.items():
bigram = tuple(bigram_string.split(" "))
- # collocation detection (30 is arbitrary):
word1 = standard_form[bigram[0].lower()]
word2 = standard_form[bigram[1].lower()]
- if score(count, counts[word1], counts[word2], n_words) > 30:
+ collocation_score = score(count, orig_counts[word1], orig_counts[word2], n_words)
+ if collocation_score > collocation_threshold:
# bigram is a collocation
# discount words in unigrams dict. hack because one word might
# appear in multiple collocations at the same time
@@ -63,10 +66,8 @@ def unigrams_and_bigrams(words, normalize_plurals=True):
counts_unigrams[word1] -= counts_bigrams[bigram_string]
counts_unigrams[word2] -= counts_bigrams[bigram_string]
counts_unigrams[bigram_string] = counts_bigrams[bigram_string]
- words = list(counts_unigrams.keys())
- for word in words:
- # remove empty / negative counts
- if counts_unigrams[word] <= 0:
+ for word, count in list(counts_unigrams.items()):
+ if count <= 0:
del counts_unigrams[word]
return counts_unigrams
=====================================
wordcloud/wordcloud.py
=====================================
@@ -1,3 +1,4 @@
+# coding=utf-8
# Author: Andreas Christian Mueller <t3kcit at gmail.com>
#
# (c) 2012
@@ -9,13 +10,16 @@ from __future__ import division
import warnings
from random import Random
+import io
import os
import re
+import base64
import sys
import colorsys
import matplotlib
import numpy as np
from operator import itemgetter
+from xml.sax import saxutils
from PIL import Image
from PIL import ImageColor
@@ -270,6 +274,14 @@ class WordCloud(object):
min_word_length : int, default=0
Minimum number of letters a word must have to be included.
+ collocation_threshold: int, default=30
+ Bigrams must have a Dunning likelihood collocation score greater than this
+ parameter to be counted as bigrams. Default of 30 is arbitrary.
+
+ See Manning, C.D., Manning, C.D. and Schütze, H., 1999. Foundations of
+ Statistical Natural Language Processing. MIT press, p. 162
+ https://nlp.stanford.edu/fsnlp/promo/colloc.pdf#page=22
+
Attributes
----------
``words_`` : dict of string to float
@@ -300,7 +312,7 @@ class WordCloud(object):
relative_scaling='auto', regexp=None, collocations=True,
colormap=None, normalize_plurals=True, contour_width=0,
contour_color='black', repeat=False,
- include_numbers=False, min_word_length=0):
+ include_numbers=False, min_word_length=0, collocation_threshold=30):
if font_path is None:
font_path = FONT_PATH
if color_func is None and colormap is None:
@@ -351,6 +363,7 @@ class WordCloud(object):
self.repeat = repeat
self.include_numbers = include_numbers
self.min_word_length = min_word_length
+ self.collocation_threshold = collocation_threshold
def fit_words(self, frequencies):
"""Create a word_cloud from words and frequencies.
@@ -554,15 +567,11 @@ class WordCloud(object):
include all those things.
"""
- stopwords = set([i.lower() for i in self.stopwords])
-
flags = (re.UNICODE if sys.version < '3' and type(text) is unicode # noqa: F821
else 0)
regexp = self.regexp if self.regexp is not None else r"\w[\w']+"
words = re.findall(regexp, text, flags)
- # remove stopwords
- words = [word for word in words if word.lower() not in stopwords]
# remove 's
words = [word[:-2] if word.lower().endswith("'s") else word
for word in words]
@@ -573,9 +582,12 @@ class WordCloud(object):
if self.min_word_length:
words = [word for word in words if len(word) >= self.min_word_length]
+ stopwords = set([i.lower() for i in self.stopwords])
if self.collocations:
- word_counts = unigrams_and_bigrams(words, self.normalize_plurals)
+ word_counts = unigrams_and_bigrams(words, stopwords, self.normalize_plurals, self.collocation_threshold)
else:
+ # remove stopwords
+ words = [word for word in words if word.lower() not in stopwords]
word_counts, _ = process_tokens(words, self.normalize_plurals)
return word_counts
@@ -729,6 +741,254 @@ class WordCloud(object):
def to_html(self):
raise NotImplementedError("FIXME!!!")
+ def to_svg(self, embed_font=False, optimize_embedded_font=True, embed_image=False):
+ """Export to SVG.
+
+ Font is assumed to be available to the SVG reader. Otherwise, text
+ coordinates may produce artifacts when rendered with replacement font.
+ It is also possible to include a subset of the original font in WOFF
+ format using ``embed_font`` (requires `fontTools`).
+
+ Note that some renderers do not handle glyphs the same way, and may
+ differ from ``to_image`` result. In particular, Complex Text Layout may
+ not be supported. In this typesetting, the shape or positioning of a
+ grapheme depends on its relation to other graphemes.
+
+ Pillow, since version 4.2.0, supports CTL using ``libraqm``. However,
+ due to dependencies, this feature is not always enabled. Hence, the
+ same rendering differences may appear in ``to_image``. As this
+ rasterized output is used to compute the layout, this also affects the
+ layout generation. Use ``PIL.features.check`` to test availability of
+ ``raqm``.
+
+    Consistent rendering is therefore expected if both Pillow and the SVG
+ renderer have the same support of CTL.
+
+ Contour drawing is not supported.
+
+ Parameters
+ ----------
+ embed_font : bool, default=False
+ Whether to include font inside resulting SVG file.
+
+ optimize_embedded_font : bool, default=True
+ Whether to be aggressive when embedding a font, to reduce size. In
+ particular, hinting tables are dropped, which may introduce slight
+ changes to character shapes (w.r.t. `to_image` baseline).
+
+ embed_image : bool, default=False
+ Whether to include rasterized image inside resulting SVG file.
+ Useful for debugging.
+
+ Returns
+ -------
+ content : string
+ Word cloud image as SVG string
+ """
+
+ # TODO should add option to specify URL for font (i.e. WOFF file)
+
+ # Make sure layout is generated
+ self._check_generated()
+
+ # Get output size, in pixels
+ if self.mask is not None:
+ width = self.mask.shape[1]
+ height = self.mask.shape[0]
+ else:
+ height, width = self.height, self.width
+
+ # Get max font size
+ if self.max_font_size is None:
+ max_font_size = max(w[1] for w in self.layout_)
+ else:
+ max_font_size = self.max_font_size
+
+ # Text buffer
+ result = []
+
+ # Get font information
+ font = ImageFont.truetype(self.font_path, int(max_font_size * self.scale))
+ raw_font_family, raw_font_style = font.getname()
+ # TODO properly escape/quote this name?
+ font_family = repr(raw_font_family)
+ # TODO better support for uncommon font styles/weights?
+ raw_font_style = raw_font_style.lower()
+ if 'bold' in raw_font_style:
+ font_weight = 'bold'
+ else:
+ font_weight = 'normal'
+ if 'italic' in raw_font_style:
+ font_style = 'italic'
+ elif 'oblique' in raw_font_style:
+ font_style = 'oblique'
+ else:
+ font_style = 'normal'
+
+ # Add header
+ result.append(
+ '<svg'
+ ' xmlns="http://www.w3.org/2000/svg"'
+ ' width="{}"'
+ ' height="{}"'
+ '>'
+ .format(
+ width * self.scale,
+ height * self.scale
+ )
+ )
+
+ # Embed font, if requested
+ if embed_font:
+
+ # Import here, to avoid hard dependency on fonttools
+ import fontTools
+ import fontTools.subset
+
+ # Subset options
+ options = fontTools.subset.Options(
+
+ # Small impact on character shapes, but reduce size a lot
+ hinting=not optimize_embedded_font,
+
+ # On small subsets, can improve size
+ desubroutinize=optimize_embedded_font,
+
+ # Try to be lenient
+ ignore_missing_glyphs=True,
+ )
+
+ # Load and subset font
+ ttf = fontTools.subset.load_font(self.font_path, options)
+ subsetter = fontTools.subset.Subsetter(options)
+ characters = {c for item in self.layout_ for c in item[0][0]}
+ text = ''.join(characters)
+ subsetter.populate(text=text)
+ subsetter.subset(ttf)
+
+ # Export as WOFF
+ # TODO is there a better method, i.e. directly export to WOFF?
+ buffer = io.BytesIO()
+ ttf.saveXML(buffer)
+ buffer.seek(0)
+ woff = fontTools.ttLib.TTFont(flavor='woff')
+ woff.importXML(buffer)
+
+ # Create stylesheet with embedded font face
+ buffer = io.BytesIO()
+ woff.save(buffer)
+ data = base64.b64encode(buffer.getbuffer()).decode('ascii')
+ url = 'data:application/font-woff;charset=utf-8;base64,' + data
+ result.append(
+ '<style>'
+ '@font-face{{'
+ 'font-family:{};'
+ 'font-weight:{};'
+ 'font-style:{};'
+ 'src:url("{}")format("woff");'
+ '}}'
+ '</style>'
+ .format(
+ font_family,
+ font_weight,
+ font_style,
+ url
+ )
+ )
+
+ # Select global style
+ result.append(
+ '<style>'
+ 'text{{'
+ 'font-family:{};'
+ 'font-weight:{};'
+ 'font-style:{};'
+ '}}'
+ '</style>'
+ .format(
+ font_family,
+ font_weight,
+ font_style
+ )
+ )
+
+ # Add background
+ if self.background_color is not None:
+ result.append(
+ '<rect'
+ ' width="100%"'
+ ' height="100%"'
+ ' style="fill:{}"'
+ '>'
+ '</rect>'
+ .format(self.background_color)
+ )
+
+ # Embed image, useful for debug purpose
+ if embed_image:
+ image = self.to_image()
+ data = io.BytesIO()
+ image.save(data, format='JPEG')
+ data = base64.b64encode(data.getbuffer()).decode('ascii')
+ result.append(
+ '<image'
+ ' width="100%"'
+ ' height="100%"'
+ ' href="data:image/jpg;base64,{}"'
+ '/>'
+ .format(data)
+ )
+
+ # For each word in layout
+ for (word, count), font_size, (y, x), orientation, color in self.layout_:
+ x *= self.scale
+ y *= self.scale
+
+ # Get text metrics
+ font = ImageFont.truetype(self.font_path, int(font_size * self.scale))
+ (size_x, size_y), (offset_x, offset_y) = font.font.getsize(word)
+ ascent, descent = font.getmetrics()
+
+ # Compute text bounding box
+ min_x = -offset_x
+ max_x = size_x - offset_x
+ max_y = ascent - offset_y
+
+ # Compute text attributes
+ attributes = {}
+ if orientation == Image.ROTATE_90:
+ x += max_y
+ y += max_x - min_x
+ transform = 'translate({},{}) rotate(-90)'.format(x, y)
+ else:
+ x += min_x
+ y += max_y
+ transform = 'translate({},{})'.format(x, y)
+
+ # Create node
+ attributes = ' '.join('{}="{}"'.format(k, v) for k, v in attributes.items())
+ result.append(
+ '<text'
+ ' transform="{}"'
+ ' font-size="{}"'
+ ' style="fill:{}"'
+ '>'
+ '{}'
+ '</text>'
+ .format(
+ transform,
+ font_size * self.scale,
+ color,
+ saxutils.escape(word)
+ )
+ )
+
+ # TODO draw contour
+
+ # Complete SVG file
+ result.append('</svg>')
+ return '\n'.join(result)
+
def _get_bolean_mask(self, mask):
"""Cast to two dimensional boolean mask."""
if mask.dtype.kind == 'f':
=====================================
wordcloud/wordcloud_cli.py
=====================================
@@ -54,14 +54,15 @@ class FileType(object):
if 'r' in self._mode:
return sys.stdin
elif 'w' in self._mode:
- return sys.stdout
+ return sys.stdout.buffer if 'b' in self._mode else sys.stdout
else:
msg = 'argument "-" with mode %r' % self._mode
raise ValueError(msg)
# all other arguments are used as file names
try:
- return io.open(string, self._mode, self._bufsize, encoding="UTF-8")
+ encoding = None if 'b' in self._mode else "UTF-8"
+ return io.open(string, self._mode, self._bufsize, encoding=encoding)
except IOError as e:
message = "can't open '%s': %s"
raise argparse.ArgumentTypeError(message % (string, e))
@@ -107,7 +108,7 @@ def make_parser():
help='specify file of stopwords (containing one word per line)'
' to remove from the given text after parsing')
parser.add_argument(
- '--imagefile', metavar='file', type=argparse.FileType('wb'),
+ '--imagefile', metavar='file', type=FileType('wb'),
default='-',
help='file the completed PNG image should be written to'
' (default: stdout)')
@@ -156,9 +157,7 @@ def make_parser():
'(default: add unigrams and bigrams)')
parser.add_argument(
'--include_numbers',
- type=bool,
- default=False,
- metavar='include_numbers',
+ action='store_true',
dest='include_numbers',
help='include numbers in wordcloud?')
parser.add_argument(
@@ -168,6 +167,52 @@ def make_parser():
metavar='min_word_length',
dest='min_word_length',
help='only include words with more than X letters')
+ parser.add_argument(
+ '--prefer_horizontal',
+ type=float, default=.9, metavar='ratio',
+ help='ratio of times to try horizontal fitting as opposed to vertical')
+ parser.add_argument(
+ '--scale',
+ type=float, default=1, metavar='scale',
+ help='scaling between computation and drawing')
+ parser.add_argument(
+ '--colormap',
+ type=str, default='viridis', metavar='map',
+ help='matplotlib colormap name')
+ parser.add_argument(
+ '--mode',
+ type=str, default='RGB', metavar='mode',
+ help='use RGB or RGBA for transparent background')
+ parser.add_argument(
+ '--max_words',
+ type=int, default=200, metavar='N',
+ help='maximum number of words')
+ parser.add_argument(
+ '--min_font_size',
+ type=int, default=4, metavar='size',
+ help='smallest font size to use')
+ parser.add_argument(
+ '--max_font_size',
+ type=int, default=None, metavar='size',
+ help='maximum font size for the largest word')
+ parser.add_argument(
+ '--font_step',
+ type=int, default=1, metavar='step',
+ help='step size for the font')
+ parser.add_argument(
+ '--random_state',
+ type=int, default=None, metavar='seed',
+ help='random seed')
+ parser.add_argument(
+ '--no_normalize_plurals',
+ action='store_false',
+ dest='normalize_plurals',
+ help='whether to remove trailing \'s\' from words')
+ parser.add_argument(
+ '--repeat',
+ action='store_true',
+ dest='repeat',
+ help='whether to repeat words and phrases')
parser.add_argument(
'--version', action='version',
version='%(prog)s {version}'.format(version=__version__))
View it on GitLab: https://salsa.debian.org/med-team/python-wordcloud/-/commit/80d06ac24cc1235b9964c3df138e17008765d5f3
--
View it on GitLab: https://salsa.debian.org/med-team/python-wordcloud/-/commit/80d06ac24cc1235b9964c3df138e17008765d5f3
You're receiving this email because of your account on salsa.debian.org.
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://alioth-lists.debian.net/pipermail/debian-med-commit/attachments/20200927/7a9362ba/attachment-0001.html>
More information about the debian-med-commit
mailing list