# # main 6_plsa.R # # Make : 2010.12.08 # Modify : 2010.12.16 # Author : Yoshimura. T # main = function (bin, K, Itr, Rep, every, s1, s2) { init = function (x, N, M, K, s1, s2) { p_z = rep (1 / K, K) p_dz = matrix (0, K, N) p_wz = matrix (0, K, M) p_zdw = list () for (z in 1:K) { p_zdw[[z]] = matrix (0, N, M) rownames (p_zdw[[z]]) = rownames (x) colnames (p_zdw[[z]]) = colnames (x) p_dz[z, ] = rand_beta (N, s1, s2) p_wz[z, ] = rand_beta (M, s1, s2) } p = list (p_z, p_dz, p_wz, p_zdw) names (p) = c ("z", "dz", "wz", "zdw") return (p) } init_mini = function (N, M, K) { z = rep (0, K) dz = matrix (0, K, N) wz = matrix (0, K, M) x = list (z, dz, wz) names (x) = c ("z", "dz", "wz") return (x) } rand_beta = function (num, s1, s2) { # s1 = s2 = 1.0 の時は一様分布となる x = rbeta (num, s1, s2) return (x / sum (x)) } judge = function (x) { # 尤度が減少すれば、反復を止める if (x <= 0.0) { return (2) } # 尤度の変化が少なくなれば、温度を上げる if (x < 100.0) { return (1) } # 尤度の変化が大きければ、温度は変えない return (0) } vsm = function (x) { size = dim (x)[2] # ベクトルの大きさを求める norm = sqrt (apply (x ** 2, 2, sum)) # 類似度行列の初期化 similar = matrix (0, nrow = size, ncol = size) for (i in 1:size) { for (j in 1:size) { # ベクトルの内積を求める inner = sum (x[, i] * x[, j]) # 類似度 = cosθ を求める similar[i, j] = inner / (norm[i] * norm[j]) } } return (similar) } printout = function (x, L, out, th, rank = 10) { cat (file = out, append = FALSE) for(i in 1:dim (x)[1]) { cat (" ########", i, "######## \n", file = out, append = TRUE) o = order (x[, i], decreasing = TRUE) o = replace (o, c (rank:dim (x)[1]), c (0)) x[o[th[i]:rank], i] = 0 s = sprintf ("%-40s %.6f\n", colnames (x)[o], x[o, i]) cat (s, "\n", file = out, append = TRUE, sep = "") } s = sprintf ("対数尤度 = %.2f", L) cat (s, file = out, append = TRUE) } threthold = function (x) { size = dim (x)[2] th = NULL for (j in 1:size) { mdif = 0.1 max = dim (x)[2] for (i in 2:(size - 1)) { dif = x[i, j] - x[i + 1, j] if (dif > mdif) { mdif = dif max = i + 1 } } th = append (th, max) } return (th) } # 引数のエラー処理 if (K < 2 || Itr < 1 || Rep < 1 || s1 < 0 || s2 < 0) { cat ("Please input larger numbers.\n") return (-1) } # 作業用ディレクトリの設定 setwd ("C:/cygwin/home/吉村 建慶/Research/main/result") # 入出力指定 input = "3_matrix/matrix.csv" s = sprintf ("%d %d %.1f %.1f", K, Itr, s1, s2) outdir = paste ("6_plsa", s, sep = "/") data = t (read.csv (input, row.name = 1)) N = dim (data)[1] M = dim (data)[2] eta = 0.95 for (j in 1:Rep) { # 初期化 B = 1 P = init (data, N, M, K, s1, s2) for (i in 1:Itr) { numer = init_mini (N, M, K) # E ステップ for (d in 1:N) { for (w in 1:M) { denom = 0 for (z in 1:K) { denom = denom + P$z[z] * (P$dz[z, d] * P$wz[z, w]) ^ B } for (z in 1:K) { P$zdw[[z]][d, w] = P$z[z] * (P$dz[z, d] * P$wz[z, w]) ^ B / denom # 用語が出現していないなら計算の意味が無いので飛ばす if (!data[d, w]) { next } temp = data[d, w] * P$zdw[[z]][d, w] numer$z[z] = numer$z[z] + temp numer$dz[z, d] = numer$dz[z, d] + temp numer$wz[z, w] = numer$wz[z, w] + temp } } } # M ステップ P$z = numer$z / sum (numer$z) P$dz = numer$dz / apply (numer$dz, 1, sum) P$wz = numer$wz / apply (numer$wz, 1, sum) # 尤度の計算 L = 0 for (d in 1:N) { for (w in 1:M) { if (!data[d, w]) { next } Pdw = 0 for (z in 1:K) { Pdw = Pdw + P$z[z] * P$dz[z, d] * P$wz[z, w] } L = L + log (Pdw) * data[d, w] } } # 結果の出力 if (every > 0 || i == Itr) { similar = vsm (P$dz) rownames (similar) = rownames (data) colnames (similar) = rownames (data) if (bin) { temp = apply (similar, 2, sort, decreasing = TRUE) th = threthold (temp) } else { th = rep (dim (similar)[2], dim (similar)[2]) } s = sprintf ("%s/Pdz_vsm_%02d-%02d.txt", outdir, j, i) printout (similar, L, s, th) } if (i > 1) { dif = L - oldL # 状況表示 s = sprintf ("%02d-%02d : B = %.2f dL = %.6f\n", j, i, B, dif) cat (s) # 収束判定 temp = judge (dif) if (temp == 2) { break } else if (temp == 1) { B = B * eta } } else { s = sprintf ("%02d-01 : B = 1.00 dL = Non\n", j) cat (s) } # 対数尤度の保存 oldL = L } } return (P) } # パラメータ(文字型)を整数型または実数型に変換してから関数に与える args = commandArgs () args.int = as.integer (args[6:10]) args.num = as.numeric (args[11:12]) system.time (main ( args.int[1], args.int[2], args.int[3], args.int[4], args.int[5], args.num[1], args.num[2] )) ### END ###