Clojure垃圾邮件过滤器

Clojure垃圾邮件过滤器,clojure,Clojure,我试图在Clojure中实现Bayseian分类器。我正在使用《集体智慧》一书作为参考。这是我的密码: (ns spam-filter.model.classifier (:require [clojure.string :as st])) ;Counts of feature/category combinations (def fc (atom {})) @fc ;Counts of documents in each category

我试图在Clojure中实现Bayseian分类器。我正在使用《集体智慧》一书作为参考。这是我的密码:

    (ns spam-filter.model.classifier
    (:require [clojure.string :as st]))

    ;Counts of feature/category combinations
    (def fc (atom {}))
    @fc


    ;Counts of documents in each category
    ;How many times every classification has been used
    (def cc (atom {}))
    @cc

    ;extracts features from the text
    (defn getwords
      "Divides the text on any character that isn't a letter.
      Converted to lowercase"
      [doc]
      (let [words (st/split doc #" ")
            less-than-20 (filter #(< (count %) 20) words)
            final-words (filter #(> (count %) 2) less-than-20)]
        (reduce (fn [final-map word]
                  (assoc final-map (.toLowerCase word) 1))
                {}
                final-words)))


    ;increase the count of a feature/category pair
    (defn incf
      [f cat] 
    (if (not (contains? @fc f))
      (swap! fc #(assoc % f {})))
    (if (not (get-in @fc [f cat]))
      (swap! fc #(assoc-in % [f cat] 0)))
    (swap! fc #(update-in % [f cat] inc)))



    ;increase the count of a category
    (defn incc
     [cat]
    (if (not (contains? @cc cat))
    (swap! cc #(assoc % cat 0)))
    (swap! cc #(update % cat inc)))


    ;The number of times a feature has appeared in a category
    (defn fcount
      [f cat]
    (let [num (get-in @fc [f cat])]
      (if (not (nil? num))
        num
        0.0)))



    ; The number of items in a category
    (defn catcount
      [cat]
    (let [n-of-items (get @cc cat)]
      (if (not (nil? n-of-items))
        n-of-items
        0)))




    ; The total numbers of items
    (defn totalcount
      []
    (reduce + (vals @cc)))



    ; The list of all categories
    (defn categories
    []
    (keys @cc))



    (defn train
      [t cat]
    (incc cat)
    (let [ws (keys (getwords t))]
      (for [w ws] (incf w cat))))



    (defn train1
      [t cat]
      (incc cat)
      (let [features (keys (getwords t))]
        (map incf features (repeat (count features) cat))))



    (defn sampletrain
    []
    [(train "Nobody owns the water." "good")
      (train "the quick rabbit jumps fences" "good")
      (train "buy pharmaceuticals now" "bad")
      (train "make quick money at the online casino" "bad")
      (train "the quick brown fox jumps" "good")])

    @fc
    @cc

    (sampletrain)


    ; probability that a word is in particular category
    ; Pr(word | classification)
    (defn fprob
      [f cat]
    (if (= (catcount cat) 0)
      0
    (float (/ (fcount f cat) (catcount cat)))))

    (fprob "quick" "good")



    ; probability that a word is in particular category
    ; assumed probability 0.5
    (defn weightedprob
      [f cat fprob]
    (let [weight 1
          ap 0.5
          basicprob (fprob f cat)
          totals (reduce + (vals (get @fc f)))
          bp (/ (+ (* weight ap) (* totals basicprob)) (+ weight totals))]
    bp))



    ; Extracts features and multiplies all
    ; their probabilities together to get
    ; an overall probability Pr(Document | Category)
    (defn docprob
      [item cat]
      (let [features (keys (getwords item))]

      (loop [features features
             p 1]
        (if (empty? features)
          p
          (recur
           (rest features)
           (* p (weightedprob (first features) cat fprob)))))))


    ;returns product of Pr(Document | Category) and Pr(Category)
    (defn prob
      [item cat]
      (let [catprob (/ (catcount cat) (totalcount))
            docprob (docprob item cat)]
        (* docprob catprob)))

    (prob "quick rabbit" "good")
    (prob "quick rabbit" "bad")


    (def thresholds (atom {}))


    (defn setthreshold
      [cat t]
    (swap! thresholds #(assoc % cat t)))


    (defn getthreshold
      [cat]
    (if (contains? @thresholds cat)
      (get @thresholds cat)
    1.0))

    (getthreshold "bad")


    (defn classify
      [item]
    (let [probs (atom {})
          max (atom 0.0)
          best (atom nil)]
    (map (fn [cat] ((swap! probs #(assoc % cat (prob item cat)))
                    (when (> (get @probs cat) @max)
                        (swap! max #(let [% (get @probs cat)] %))
                        (swap! best #(let [% cat] %))))) (categories))
    (map (fn [cat] (if (> (* (get @probs cat) (getthreshold @best)) (get @probs @best))
                      nil
                      @best))) (filter #(not= % @best) (categories))))



    (classify "quick money")

map
是惰性的,因此如果您在调用它时没有意识到生成的惰性序列,那么它就没有效果。你在几个地方这样做。如果您必须调用集合中的每个元素的函数,只考虑副作用,请考虑使用<代码> Run!<代码>取而代之

更一般地说,算法实现依赖于各种全局原子的变异。这在clojure中是非常不符合逻辑的,这使得它很难阅读,也很难推理。请尝试传入并返回预期状态。这样一来,你就不会有你的
map
问题了

您的代码中还有一系列与表单相关的竞争条件,如

  (defn incc
     [cat]
    (if (not (contains? @cc cat))
    (swap! cc #(assoc % cat 0))) ; no guarantee that @cc wasn't updated 
                                 ; between here and the contains? line above
    (swap! cc #(update % cat inc)))
使用
fnil

(defn incc
     [cat]
     (swap! cc update cat (fnil inc 0)))

map
是惰性的,因此如果您在调用它时没有意识到生成的惰性序列,那么它就没有效果。你在几个地方这样做。如果您必须调用集合中的每个元素的函数,只考虑副作用,请考虑使用<代码> Run!<代码>取而代之

更一般地说,算法实现依赖于各种全局原子的变异。这在clojure中是非常不符合逻辑的,这使得它很难阅读,也很难推理。请尝试传入并返回预期状态。这样一来,你就不会有你的
map
问题了

您的代码中还有一系列与表单相关的竞争条件,如

  (defn incc
     [cat]
    (if (not (contains? @cc cat))
    (swap! cc #(assoc % cat 0))) ; no guarantee that @cc wasn't updated 
                                 ; between here and the contains? line above
    (swap! cc #(update % cat inc)))
使用
fnil

(defn incc
     [cat]
     (swap! cc update cat (fnil inc 0)))