2章 その3

p.13より
その2のユークリッド距離を用いた方法では、全体的に高い点数を付ける人や低い点数を付ける人がいるため、好みが似ていても採点傾向の差によって類似度が低く算出されてしまう。
この採点傾向の差を補正する為に、ピアソン相関係数を使う。

recommendations.rb の Critics クラスに sim_pearson メソッドを追加

# Sample movie-rating data set plus two similarity measures between critics
# (Euclidean-distance based and Pearson correlation based).
class Critics

  # Nested hash of ratings: critic name => { movie title => score }.
  attr_reader :users

  # Returns the Pearson correlation coefficient between p1 and p2.
  #
  # prefs - nested hash: person => { item => numeric score }
  # p1    - key of the first person in prefs
  # p2    - key of the second person in prefs
  #
  # Returns 0 when the two people share no rated item, or when the
  # correlation is undefined (one of them gave every shared item the
  # same score). Otherwise returns a Float.
  def sim_pearson(prefs, p1, p2)
    shared = shared_items(prefs, p1, p2)
    return 0 if shared.empty?

    # Float count so that integer ratings do not trigger truncating
    # integer division in the formulas below.
    n = shared.length.to_f

    # Accumulate the plain sums, the sums of squares and the sum of
    # products in a single pass over the shared items.
    sum1 = 0
    sum2 = 0
    sum1_sq = 0
    sum2_sq = 0
    p_sum = 0
    shared.each do |item|
      score1 = prefs[p1][item]
      score2 = prefs[p2][item]
      sum1 += score1
      sum2 += score2
      sum1_sq += score1 * score1
      sum2_sq += score2 * score2
      p_sum += score1 * score2
    end

    # Compute the Pearson score.
    num = p_sum - (sum1 * sum2 / n)
    spread = (sum1_sq - sum1 * sum1 / n) * (sum2_sq - sum2 * sum2 / n)
    # Rounding error can make the product slightly negative when one
    # person's scores are all equal; treat that like the zero case
    # instead of letting Math.sqrt raise.
    return 0 if spread <= 0

    den = Math.sqrt(spread)
    return 0 if den == 0

    num / den
  end

  # Returns a distance-based similarity score for person1 and person2:
  # 1 / (1 + sum of squared rating differences over shared items).
  #
  # prefs   - nested hash: person => { item => numeric score }
  # person1 - key of the first person in prefs
  # person2 - key of the second person in prefs
  #
  # Returns 0 when the two people share no rated item, otherwise a
  # Float in (0, 1].
  def sim_distance(prefs, person1, person2)
    shared = shared_items(prefs, person1, person2)

    # Nothing in common: no similarity.
    return 0 if shared.empty?

    # Add up the squares of all the differences.
    sum_of_squares = shared.inject(0) do |total, item|
      diff = prefs[person1][item] - prefs[person2][item]
      total + diff * diff
    end

    # 1.0 forces float division; with integer ratings a plain 1/(...)
    # would truncate to 0.
    1.0 / (1 + sum_of_squares)
  end

  def initialize
    @users = {
      'Lisa Rose' => {
        'Lady in the Water' => 2.5,
        'Snake on a Plane' => 3.5,
        'Just My Luck' => 3.0,
        'Superman Returns' => 3.5,
        'You, Me and Dupree' => 2.5,
        'The Night Listener' => 3.0
      },
      'Gene Seymour' => {
        'Lady in the Water' => 3.0,
        'Snake on a Plane' => 3.5,
        'Just My Luck' => 1.5,
        'Superman Returns' => 5.0,
        'The Night Listener' => 3.0,
        'You, Me and Dupree' => 3.5
      },
      'Michael Phillips' => {
        'Lady in the Water' => 2.5,
        'Snake on a Plane' => 3.0,
        'Superman Returns' => 3.5,
        'The Night Listener' => 4.0
      },
      'Claudia Puig' => {
        'Snake on a Plane' => 3.5,
        'Just My Luck' => 3.0,
        'The Night Listener' => 4.5,
        'Superman Returns' => 4.0,
        'You, Me and Dupree' => 2.5
      },
      'Mick LaSalle' => {
        'Lady in the Water' => 3.0,
        'Snake on a Plane' => 4.0,
        'Just My Luck' => 2.0,
        'Superman Returns' => 3.0,
        'The Night Listener' => 3.0,
        'You, Me and Dupree' => 2.0
      },
      'Jack Matthews' => {
        'Lady in the Water' => 3.0,
        'Snake on a Plane' => 4.0,
        'The Night Listener' => 3.0,
        'Superman Returns' => 5.0,
        'You, Me and Dupree' => 3.5
      },
      'Toby' => {
        'Snake on a Plane' => 4.5,
        'You, Me and Dupree' => 1.0,
        'Superman Returns' => 4.0
      }
    }
  end

  private

  # Returns the items both people have rated, in person1's key order.
  # An item whose stored score is nil counts as not rated.
  def shared_items(prefs, person1, person2)
    first = prefs[person1]
    second = prefs[person2]
    first.keys.select do |item|
      !first[item].nil? && !second[item].nil?
    end
  end

end


irbで、2人の間のピアソン相関を計算してみる
p.14より
irb -r recommendations.rb

>> c = Critics.new
=> #<Critics:0x69bedc @users={"Jack Matthews"=>{"The Night Listener"=>3.0, "Superman Returns"=>5.0, "Lady in the Water"=>3.0, "Snake on a Plane"=>4.0, "You, Me and Dupree"=>3.5}, "Gene Seymour"=>{"The Night Listener"=>3.0, "Superman Returns"=>5.0, "Lady in the Water"=>3.0, "Snake on a Plane"=>3.5, "You, Me and Dupree"=>3.5, "Just My Luck"=>1.5}, "Mick LaSalle"=>{"The Night Listener"=>3.0, "Superman Returns"=>3.0, "Lady in the Water"=>3.0, "Snake on a Plane"=>4.0, "You, Me and Dupree"=>2.0, "Just My Luck"=>2.0}, "Toby"=>{"Superman Returns"=>4.0, "Snake on a Plane"=>4.5, "You, Me and Dupree"=>1.0}, "Claudia Puig"=>{"The Night Listener"=>4.5, "Superman Returns"=>4.0, "Snake on a Plane"=>3.5, "You, Me and Dupree"=>2.5, "Just My Luck"=>3.0}, "Lisa Rose"=>{"The Night Listener"=>3.0, "Superman Returns"=>3.5, "Lady in the Water"=>2.5, "Snake on a Plane"=>3.5, "You, Me and Dupree"=>2.5, "Just My Luck"=>3.0}, "Michael Phillips"=>{"The Night Listener"=>4.0, "Superman Returns"=>3.5, "Lady in the Water"=>2.5, "Snake on a Plane"=>3.0}}>
>> c.sim_pearson(c.users, 'Lisa Rose', 'Gene Seymour')
=> 0.39605901719067