[Pkg-haskell-commits] [SCM] Optimized edit distances for fuzzy matching, including Levenshtein and restricted Damerau-Levenshtein algorithms branch, master, updated. f45e5b72e82c99bf0333e5c869f83cb9cd35ae47

Joey Hess joey at kitenet.net
Fri Apr 6 19:10:56 UTC 2012


The following commit has been merged in the master branch:
commit 8923f1f65f8efb8ed09dc11b344bd202f91f1a96
Author: Joey Hess <joey at kitenet.net>
Date:   Fri Apr 6 14:57:16 2012 -0400

    remove several files that are not distributed in the cabal tarball
    
    This is around the third haskell package I've noticed is not distributing
    in the tarball files like TODO and parts of its test suite.

diff --git a/.ghci b/.ghci
deleted file mode 100644
index e85c930..0000000
--- a/.ghci
+++ /dev/null
@@ -1,3 +0,0 @@
-:set -Wall -fno-warn-name-shadowing  -fno-warn-orphans
-:load Text.EditDistance
-
diff --git a/.gitattributes b/.gitattributes
deleted file mode 100644
index f6a90d5..0000000
--- a/.gitattributes
+++ /dev/null
@@ -1,2 +0,0 @@
-*.hs	whitespace=!indent,trail,space
-
diff --git a/.gitignore b/.gitignore
deleted file mode 100644
index 0a395e9..0000000
--- a/.gitignore
+++ /dev/null
@@ -1,5 +0,0 @@
-*.o
-*.plot
-*.ps
-dist
-.DS_Store
diff --git a/TODO b/TODO
deleted file mode 100644
index 89e6602..0000000
--- a/TODO
+++ /dev/null
@@ -1,6 +0,0 @@
-== High Priority ==
-
-== Medium Priority ==
-* Use WORD_SIZE_IN_BITS so that bit-vectors can benefit from 64 bit machines
-
-== Low Priority ==
\ No newline at end of file
diff --git a/Text/EditDistance/Tests/EditOperationOntology.hs b/Text/EditDistance/Tests/EditOperationOntology.hs
deleted file mode 100644
index 1951bfd..0000000
--- a/Text/EditDistance/Tests/EditOperationOntology.hs
+++ /dev/null
@@ -1,97 +0,0 @@
-module Text.EditDistance.Tests.EditOperationOntology where
-
-import Text.EditDistance.EditCosts
-
-import Test.QuickCheck
-import System.Random
-import Control.Monad
-import Data.Char
-
-
-instance Arbitrary Char where
-    arbitrary     = choose ('\32', '\128')
-    coarbitrary c = variant (ord c `rem` 4)
-
-
-class Arbitrary ops => EditOperation ops where
-    edit :: String -> ops -> Gen (String, EditCosts -> Int)
-    containsTransposition :: ops -> Bool
-
-instance EditOperation op => EditOperation [op] where
-   edit xs ops = foldM (\(xs, cost) op -> fmap (\(xs', cost') -> (xs', \ecs -> cost ecs + cost' ecs)) $ edit xs op) (xs, const 0) ops
-   containsTransposition = any containsTransposition
-
-
-data EditedString ops = MkEditedString {
-    oldString :: String,
-    newString :: String,
-    operations :: ops,
-    cost :: EditCosts -> Int
-}
-
-instance Show ops => Show (EditedString ops) where
-    show (MkEditedString old_string new_string ops _cost) = show old_string ++ " ==> " ++ show new_string ++ " (by " ++ show ops ++ ")"
-
-instance EditOperation ops => Arbitrary (EditedString ops) where
-    arbitrary = do
-        old_string <- arbitrary
-        edit_operations <- arbitrary
-        (new_string, cost) <- edit old_string edit_operations
-        return $ MkEditedString {
-            oldString = old_string,
-            newString = new_string,
-            operations = edit_operations,
-            cost = cost
-        }
-
-
-data ExtendedEditOperation = Deletion
-                           | Insertion Char
-                           | Substitution Char
-                           | Transposition
-                           deriving (Show)
-
-instance Arbitrary ExtendedEditOperation where
-    arbitrary = oneof [return Deletion, fmap Insertion arbitrary, fmap Substitution arbitrary, return Transposition]
-
-instance EditOperation ExtendedEditOperation where
-    edit str op = do
-        gen <- rand
-        let max_split_ix | Transposition <- op = length str - 1
-                         | otherwise           = length str
-            (split_ix, _) = randomR (1, max_split_ix) gen
-            (str_l, str_r) = splitAt split_ix str
-            non_null = not $ null str
-            transposable = length str > 1
-        case op of
-            Deletion | non_null -> do
-                let old_ch = last str_l
-                return (init str_l ++ str_r, \ec -> deletionCost ec old_ch)
-            Insertion new_ch | non_null -> do
-                return (str_l ++ new_ch : str_r, \ec -> insertionCost ec new_ch)
-            Insertion new_ch | otherwise -> return ([new_ch], \ec -> insertionCost ec new_ch)   -- Need special case because randomR (1, 0) is undefined
-            Substitution new_ch | non_null -> do
-                let old_ch = last str_l
-                return (init str_l ++ new_ch : str_r, \ec -> substitutionCost ec old_ch new_ch)
-            Transposition | transposable -> do                  -- Need transposable rather than non_null because randomR (1, 0) is undefined
-                let backwards_ch = head str_r
-                    forwards_ch = last str_l
-                return (init str_l ++ backwards_ch : forwards_ch : tail str_r, \ec -> transpositionCost ec backwards_ch forwards_ch)
-            _ -> return (str, const 0)
-
-    containsTransposition Transposition = True
-    containsTransposition _             = False
-
-
--- This all really sucks but I can't think of something better right now
-newtype BasicEditOperation = MkBasic ExtendedEditOperation
-
-instance Show BasicEditOperation where
-    show (MkBasic x) = show x
-
-instance Arbitrary BasicEditOperation where
-    arbitrary = fmap MkBasic $ oneof [return Deletion, fmap Insertion arbitrary, fmap Substitution arbitrary]
-
-instance EditOperation BasicEditOperation where
-    edit str (MkBasic op) = edit str op
-    containsTransposition _ = False
\ No newline at end of file
diff --git a/Text/EditDistance/Tests/Properties.hs b/Text/EditDistance/Tests/Properties.hs
deleted file mode 100644
index 78fa6b0..0000000
--- a/Text/EditDistance/Tests/Properties.hs
+++ /dev/null
@@ -1,94 +0,0 @@
-{-# LANGUAGE PatternGuards, PatternSignatures, ScopedTypeVariables #-}
-
-module Text.EditDistance.Tests.Properties (
-        tests
-    ) where
-
-import Text.EditDistance.EditCosts
-import qualified Text.EditDistance.SquareSTUArray as SquareSTUArray
-import qualified Text.EditDistance.STUArray as STUArray
-import qualified Text.EditDistance.Bits as Bits
-import Text.EditDistance.Tests.EditOperationOntology
-
-import Test.Framework
-import Test.Framework.Providers.QuickCheck
-import Test.QuickCheck
-
-
-tests :: [Test]
-tests = [ testGroup "Levenshtein Distance (SquareSTUArray)" sqstu_levenshteinDistanceTests
-        , testGroup "Restricted Damerau-Levenshtein Distance (SquareSTUArray)" sqstu_restrictedDamerauLevenshteinDistanceTests
-        , testGroup "Levenshtein Distance (STUArray)" stu_levenshteinDistanceTests
-        , testGroup "Restricted Damerau-Levenshtein Distance (STUArray)" stu_restrictedDamerauLevenshteinDistanceTests
-        , testGroup "Levenshtein Distance (Bits)" bits_levenshteinDistanceTests
-        , testGroup "Restricted Damerau-Levenshtein Distance (Bits)" bits_restrictedDamerauLevenshteinDistanceTests
-        , testGroup "Levenshtein Distance Crosschecks" levenshteinDistanceCrosscheckTests
-        , testGroup "Restricted Damerau-Levenshtein Distance Crosschecks" restrictedDamerauLevenshteinDistanceCrosscheckTests
-        --, testGroup "Levenshtein Distance Cutoff (Bits)" bits_levenshteinDistanceCutoffTests
-        ]
-  where
-    sqstu_levenshteinDistanceTests                  = standardDistanceTests SquareSTUArray.levenshteinDistance                  interestingCosts (undefined :: BasicEditOperation)
-    sqstu_restrictedDamerauLevenshteinDistanceTests = standardDistanceTests SquareSTUArray.restrictedDamerauLevenshteinDistance interestingCosts (undefined :: ExtendedEditOperation)
-    stu_levenshteinDistanceTests                    = standardDistanceTests STUArray.levenshteinDistance                        interestingCosts (undefined :: BasicEditOperation)
-    stu_restrictedDamerauLevenshteinDistanceTests   = standardDistanceTests STUArray.restrictedDamerauLevenshteinDistance       interestingCosts (undefined :: ExtendedEditOperation)
-    bits_levenshteinDistanceTests                   = standardDistanceTests (const Bits.levenshteinDistance)                    defaultEditCosts (undefined :: BasicEditOperation)
-    bits_restrictedDamerauLevenshteinDistanceTests  = standardDistanceTests (const Bits.restrictedDamerauLevenshteinDistance)   defaultEditCosts (undefined :: ExtendedEditOperation)
-    
-    --bits_levenshteinDistanceCutoffTests = [ testProperty "Cutoff vs. Non-Cutoff" (forAll arbitrary (\cutoff -> distanceEqIfBelowProperty cutoff (Bits.levenshteinDistanceCutoff cutoff) Bits.levenshteinDistance defaultEditCosts (undefined :: BasicEditOperation))) ]
-    
-    levenshteinDistanceCrosscheckTests 
-      = crossCheckTests [ ("SquareSTUArray", SquareSTUArray.levenshteinDistance defaultEditCosts)
-                        , ("STUArray",       STUArray.levenshteinDistance defaultEditCosts)
-                        , ("Bits",           Bits.levenshteinDistance) ]
-                        (undefined :: BasicEditOperation)
-    
-    restrictedDamerauLevenshteinDistanceCrosscheckTests 
-      = crossCheckTests [ ("SquareSTUArray", SquareSTUArray.restrictedDamerauLevenshteinDistance defaultEditCosts)
-                        , ("STUArray",       STUArray.restrictedDamerauLevenshteinDistance defaultEditCosts)
-                        , ("Bits",           Bits.restrictedDamerauLevenshteinDistance) ]
-                        (undefined :: ExtendedEditOperation)
-
-
-interestingCosts :: EditCosts
-interestingCosts = EditCosts {
-    deletionCosts = ConstantCost 1,
-    insertionCosts = ConstantCost 2,
-    substitutionCosts = ConstantCost 3, -- Can't be higher than deletion + insertion
-    transpositionCosts = ConstantCost 3 -- Can't be higher than deletion + insertion
-}
-
-
-crossCheckTests :: forall op. (EditOperation op, Show op) => [(String, String -> String -> Int)] -> op -> [Test]
-crossCheckTests named_distances _op_dummy
-  = [ testProperty (name1 ++ " vs. " ++ name2) (distanceEqProperty distance1 distance2 _op_dummy)
-    | (ix1, (name1, distance1)) <- enumerated_named_distances, (ix2, (name2, distance2)) <- enumerated_named_distances, ix2 > ix1 ]
-  where
-    enumerated_named_distances = [(1 :: Int)..] `zip` named_distances
-
-distanceEqProperty :: (String -> String -> Int) -> (String -> String -> Int) -> op -> EditedString op -> Bool
-distanceEqProperty distance1 distance2 _op_dummy (MkEditedString old new _ _) = distance1 old new == distance2 old new
-
---distanceEqIfBelowProperty :: (EditOperation op) => Int -> (String -> String -> Int) -> (String -> String -> Int) -> EditCosts -> op -> EditedString op -> Property
---distanceEqIfBelowProperty cutoff distance1 distance2 costs _op_dummy (MkEditedString old new ops) = (editCost costs ops <= cutoff) ==> distance1 old new == distance2 old new
-
-standardDistanceTests :: forall op. (EditOperation op, Show op) => (EditCosts -> String -> String -> Int) -> EditCosts -> op -> [Test]
-standardDistanceTests distance costs _op_dummy
-  = [ testProperty "Self distance is zero" prop_self_distance_zero
-    , testProperty "Pure deletion has the right cost" prop_pure_deletion_cost_correct
-    , testProperty "Pure insertion has the right cost" prop_pure_insertion_cost_correct
-    , testProperty "Single operations have the right cost" prop_single_op_cost_is_distance
-    , testProperty "Cost bound is respected" prop_combined_op_cost_at_least_distance
-    ]
-  where
-    testableDistance = distance costs
-
-    prop_self_distance_zero str
-      = testableDistance str str == 0
-    prop_pure_deletion_cost_correct str
-      = testableDistance str "" == sum [deletionCost costs c | c <- str]
-    prop_pure_insertion_cost_correct str
-      = testableDistance "" str == sum [insertionCost costs c | c <- str]
-    prop_single_op_cost_is_distance (MkEditedString old new _ops cost :: EditedString op)
-      = (length old > 2) ==> testableDistance old new == cost costs || old == new
-    prop_combined_op_cost_at_least_distance (MkEditedString old new ops cost :: EditedString [op])
-      = not (containsTransposition ops) ==> testableDistance old new <= cost costs
\ No newline at end of file
diff --git a/import_wiki.lhs b/import_wiki.lhs
deleted file mode 100755
index 9b6f652..0000000
--- a/import_wiki.lhs
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/usr/bin/env runhaskell
-
-\begin{code}
-import Text.HTML.Download
-import Text.HTML.TagSoup
-import Network.URL
-import Control.Monad
-
-
-uSER = "batterseapower"
-pROJECT_NAME = "edit-distance"
-rEADME_PAGE = "readme"
-rEADME_FILENAME = "README.textile"
-
-
-encURL :: String -> String
-encURL = encString False ok_url
-
-gitHubEditWikiURL :: String -> String -> String -> String
-gitHubEditWikiURL user project page = encURL $ "http://github.com/" ++ user ++ "/" ++ project ++ "/wikis/" ++ page ++ "/edit"
-
-main :: IO ()
-main = do
-   tags <- liftM parseTags $ openURL (gitHubEditWikiURL uSER pROJECT_NAME rEADME_PAGE)
-   let text = innerText $ takeWhile (~/= "</textarea>") $ head $ sections (~== "<textarea id=wiki_body>") tags
-   writeFile rEADME_FILENAME text
-\end{code}
diff --git a/release b/release
deleted file mode 100755
index 85bf421..0000000
--- a/release
+++ /dev/null
@@ -1,62 +0,0 @@
-#!/bin/bash
-#
-
-echo "Have you updated the version number? Type 'yes' if you have!"
-read version_response
-
-if [ "$version_response" != "yes" ]; then
-    echo "Go and update the version number"
-    exit 1
-fi
-
-sdist_output=`runghc Setup.lhs sdist`
-
-if [ "$?" != "0" ]; then
-    echo "Cabal sdist failed, aborting"
-    exit 1
-fi
-
-# Want to find a line like:
-# Source tarball created: dist/ansi-terminal-0.1.tar.gz
-
-# Test this with:
-# runghc Setup.lhs sdist | grep ...
-filename=`echo $sdist_output | sed 's/.*Source tarball created: \([^ ]*\).*/\1/'`
-echo "Filename: $filename"
-
-if [ "$filename" = "$sdist_output" ]; then
-    echo "Could not find filename, aborting"
-    exit 1
-fi
-
-# Test this with:
-# echo dist/ansi-terminal-0.1.tar.gz | sed ...
-version=`echo $filename | sed 's/^[^0-9]*\([0-9\.]*\).tar.gz$/\1/'`
-echo "Version: $version"
-
-if [ "$version" = "$filename" ]; then
-    echo "Could not find version, aborting"
-    exit 1
-fi
-
-echo "This is your last chance to abort! I'm going to upload in 10 seconds"
-sleep 10
-
-git tag "v$version"
-
-if [ "$?" != "0" ]; then
-    echo "Git tag failed, aborting"
-    exit 1
-fi
-
-# You need to have stored your Hackage username and password as directed by cabal upload
-# I use -v5 because otherwise the error messages can be cryptic :-)
-cabal upload -v2 $filename
-
-if [ "$?" != "0" ]; then
-    echo "Hackage upload failed, aborting"
-    exit 1
-fi
-
-# Success!
-exit 0

-- 
Optimized edit distances for fuzzy matching, including Levenshtein and restricted Damerau-Levenshtein algorithms



More information about the Pkg-haskell-commits mailing list