next up previous contents
Next: この文書について... Up: 実験で使用したプログラム Previous: LSA   目次

PLSA

#
# main 6_plsa.R
#
#   Make   : 2010.12.08
#   Modify : 2010.12.16
#   Author : Yoshimura. T
#

main = function (bin, K, Itr, Rep, every, s1, s2) {
  init = function (x, N, M, K, s1, s2) {
    p_z   = rep (1 / K, K)
    p_dz  = matrix (0, K, N)
    p_wz  = matrix (0, K, M)
    p_zdw = list ()
    
    for (z in 1:K) {
      p_zdw[[z]] = matrix (0, N, M)
      rownames (p_zdw[[z]]) = rownames (x)
      colnames (p_zdw[[z]]) = colnames (x)
      
      p_dz[z, ] = rand_beta (N, s1, s2)
      p_wz[z, ] = rand_beta (M, s1, s2)
    }
    
    p = list (p_z, p_dz, p_wz, p_zdw)
    names (p) = c ("z", "dz", "wz", "zdw")
    
    return (p)
  }
  
  init_mini = function (N, M, K) {
    z  = rep (0, K)
    dz = matrix (0, K, N)
    wz = matrix (0, K, M)
    
    x = list (z, dz, wz)
    names (x) = c ("z", "dz", "wz")
    
    return (x)
  }
  
  rand_beta = function (num, s1, s2) {
    # s1 = s2 = 1.0 の時は一様分布となる
    x = rbeta (num, s1, s2)
    return (x / sum (x))
  }
  
  judge = function (x) {
    # 尤度が減少すれば、反復を止める
    if (x <= 0.0) {
      return (2)
    }
    # 尤度の変化が少なくなれば、温度を上げる
    if (x < 100.0) {
      return (1)
    }
    # 尤度の変化が大きければ、温度は変えない
    return (0)
  }
  
  vsm = function (x) {
    size = dim (x)[2]
    
    # ベクトルの大きさを求める
    norm = sqrt (apply (x ** 2, 2, sum))
    
    # 類似度行列の初期化
    similar = matrix (0, nrow = size, ncol = size)
    
    for (i in 1:size) {
      for (j in 1:size) {
        # ベクトルの内積を求める
        inner = sum (x[, i] * x[, j])
        # 類似度 = cosθ を求める
        similar[i, j] = inner / (norm[i] * norm[j])
      }
    }
    
    return (similar)
  }
  
  printout = function (x, L, out, th, rank = 10) {
    cat (file = out, append = FALSE)
    
    for(i in 1:dim (x)[1]) {
      cat (" ########", i, "######## \n", file = out, append = TRUE)
      
      o = order (x[, i], decreasing = TRUE)
      o = replace (o, c (rank:dim (x)[1]), c (0))
      x[o[th[i]:rank], i] = 0
      s = sprintf ("%-40s %.6f\n", colnames (x)[o], x[o, i])
      cat (s, "\n", file = out, append = TRUE, sep = "")
    }
    
    s = sprintf ("対数尤度 = %.2f", L)
    cat (s, file = out, append = TRUE)
  }
  
  threthold = function (x) {
    size = dim (x)[2]
    th = NULL
    
    for (j in 1:size) {
      mdif = 0.1
      max = dim (x)[2]
      
      for (i in 2:(size - 1)) {
        dif = x[i, j] - x[i + 1, j]
        if (dif > mdif) {
          mdif = dif
          max = i + 1
        }
      }
      
      th = append (th, max)
    }
    
    return (th)
  }
  
  # 引数のエラー処理
  if (K < 2 || Itr < 1 || Rep < 1 || s1 < 0 || s2 < 0) {
    cat ("Please input larger numbers.\n")
    return (-1)
  }
  
  # 作業用ディレクトリの設定
  setwd ("C:/cygwin/home/吉村 建慶/Research/main/result")
  
  # 入出力指定
  input  = "3_matrix/matrix.csv"
  s = sprintf ("%d %d %.1f %.1f", K, Itr, s1, s2)
  outdir = paste ("6_plsa", s, sep = "/")
  
  data = t (read.csv (input, row.name = 1))
  N = dim (data)[1]
  M = dim (data)[2]
  eta = 0.95
  
  for (j in 1:Rep) {
    # 初期化
    B = 1
    P = init (data, N, M, K, s1, s2)
    
    for (i in 1:Itr) {
      numer = init_mini (N, M, K)
      
      # E ステップ
      for (d in 1:N) {
        for (w in 1:M) {
          denom = 0
          for (z in 1:K) {
            denom = denom + P$z[z] * (P$dz[z, d] * P$wz[z, w]) ^ B
          }
          for (z in 1:K) {
            P$zdw[[z]][d, w] = P$z[z] * (P$dz[z, d] * P$wz[z, w]) ^ B / denom
            
            # 用語が出現していないなら計算の意味が無いので飛ばす
            if (!data[d, w]) { next }
            temp = data[d, w] * P$zdw[[z]][d, w]
            numer$z[z] = numer$z[z] + temp
            numer$dz[z, d] = numer$dz[z, d] + temp
            numer$wz[z, w] = numer$wz[z, w] + temp
          }
        }
      }
      
      # M ステップ
      P$z  = numer$z / sum (numer$z)
      P$dz = numer$dz / apply (numer$dz, 1, sum)
      P$wz = numer$wz / apply (numer$wz, 1, sum)
      
      # 尤度の計算
      L = 0
      for (d in 1:N) {
        for (w in 1:M) {
          if (!data[d, w]) { next }
          Pdw = 0  
          for (z in 1:K) {
            Pdw = Pdw + P$z[z] * P$dz[z, d] * P$wz[z, w]
          }
          L = L + log (Pdw) * data[d, w]
        }
      }
      
      # 結果の出力
      if (every > 0 || i == Itr) {
        similar = vsm (P$dz)
        rownames (similar) = rownames (data)
        colnames (similar) = rownames (data)
        
        if (bin) {
          temp = apply (similar, 2, sort, decreasing = TRUE)
          th = threthold (temp)
        } else {
          th = rep (dim (similar)[2], dim (similar)[2])
        }
        
        s = sprintf ("%s/Pdz_vsm_%02d-%02d.txt", outdir, j, i)
        printout (similar, L, s, th)
      }
      
      if (i > 1) {
        dif = L - oldL
        
        # 状況表示
        s = sprintf ("%02d-%02d : B = %.2f  dL = %.6f\n", j, i, B, dif)
        cat (s)
        
        # 収束判定
        temp = judge (dif)
        if (temp == 2) { break }
        else if (temp == 1) { B = B * eta }
      } else {
        s = sprintf ("%02d-01 : B = 1.00  dL = Non\n", j)
        cat (s)
      }
      
      # 対数尤度の保存
      oldL = L
    }
  }
  
  return (P)
}

# パラメータ(文字型)を整数型または実数型に変換してから関数に与える
args = commandArgs ()
args.int = as.integer (args[6:10])
args.num = as.numeric (args[11:12])
system.time (main (
  args.int[1], args.int[2], args.int[3], args.int[4], args.int[5],
  args.num[1], args.num[2]
))

### END ###



Deguchi Lab. 2011年3月4日