# YeahperShiny/tfidfDotProduct.R (graySquirrel/YeahperShiny, master)
library(NLP)
library(tm)
# tfidfDotProduct.R
# implements
# create.tfidf.from.doclist(doclist) returns tfidf matrix. Each row is a term
#    doclist is a list of character strings
# create.tfidf.from.query(doc.tfidf, query) returns a tfidf vector for the query
#    doc.tfidf is output from create.tfidf.from.doclist
#    query is a character string
# create.doc.scores.from.tfidf.and.query(doc.tfidf, query) returns a doc.score matrix
#    doc.tfidf is output from create.tfidf.from.doclist
#    query is a character string
#    This uses create.tfidf.from.query, then also does the dot product.

# Internal functions
# Debug switch: when TRUE, create.tfidf.from.query prints its intermediate
# values. Spelled FALSE rather than F because F is an ordinary variable that
# can be reassigned.
dd <- FALSE
## Log-scaled tf-idf weights for one term
# tf.vec  : vector of term frequencies
# df      : document frequency of the term (scalar)
# numDocs : total number of documents
# Entries with tf > 0 receive (1 + log2(tf)) * log2(numDocs / df);
# every other entry is 0.
get.tf.idf.weights <- function(tf.vec, df, numDocs) {
    has.term <- tf.vec > 0
    idf <- log2(numDocs / df)
    weights <- numeric(length(tf.vec))
    weights[has.term] <- (1 + log2(tf.vec[has.term])) * idf
    weights
}

## Tf-idf weights for a single term across the corpus
### tfidf.row holds the term's frequency in each document. Its length is
### used as the number of documents and its count of positive entries as
### the term's document frequency.
get.weights.per.term.vec <- function(tfidf.row) {
    get.tf.idf.weights(tf.vec = tfidf.row,
                       df = sum(tfidf.row > 0),
                       numDocs = length(tfidf.row))
}

#### create.tfidf.from.doclist
# Build a column-normalised tf-idf matrix from a collection of documents.
#
# doclist : list of character strings, one per document. names(doclist) is
#           stored on the source object as `Names`.
# Returns : numeric matrix (as produced by scale()) with terms in rows and
#           documents in columns. Each document column is divided by its
#           Euclidean norm; a column whose weights are all zero stays all
#           zero instead of becoming NaN.
create.tfidf.from.doclist <- function(doclist) {
    my.docs <- VectorSource(doclist)
    my.docs$Names <- names(doclist)
    # create corpus
    my.corpus <- Corpus(my.docs)
    # clean up corpus
    my.corpus <- tm_map(my.corpus, content_transformer(removePunctuation))
    #my.corpus <- tm_map(my.corpus, stemDocument) # Snowball not available for R 3.1.2
    my.corpus <- tm_map(my.corpus, content_transformer(removeNumbers))
    my.corpus <- tm_map(my.corpus, content_transformer(tolower))
    my.corpus <- tm_map(my.corpus, content_transformer(stripWhitespace))
    # Create the Term-Document matrix, which is a 2D representation of the
    #  term frequency for each document.  Terms in rows, docs in cols
    # NOTE(review): the PlainTextDocument pass looks like a workaround for
    #  tolower() changing the document class in older tm releases - confirm
    #  it is still needed with the installed tm version.
    my.corpus <- tm_map(my.corpus, content_transformer(PlainTextDocument))
    term.doc.matrix.stm <- TermDocumentMatrix(my.corpus)
    # Convert the sparse matrix representation to a normal matrix to work from.
    term.doc.matrix <- as.matrix(term.doc.matrix.stm)
    # Calculate tfidf weights, one call per term (row)
    term.weights <- apply(term.doc.matrix, 1, get.weights.per.term.vec)
    if (is.matrix(term.weights)) {
        # apply() stacks the per-term results as columns; flip back to
        # terms-in-rows
        tfidf.matrix <- t(term.weights)
    } else {
        # With a single document each call returns one number and apply()
        # collapses the result to a plain vector; rebuild the one-column
        # terms x docs matrix explicitly.
        tfidf.matrix <- matrix(term.weights, ncol = 1)
        rownames(tfidf.matrix) <- rownames(term.doc.matrix)
    }
    colnames(tfidf.matrix) <- colnames(term.doc.matrix)
    # Normalise every document vector to unit length. A norm of exactly 0
    # means the whole column is 0; dividing by 1 keeps it at 0 where a
    # division by 0 would yield NaN.
    col.norms <- sqrt(colSums(tfidf.matrix^2))
    col.norms[col.norms == 0] <- 1
    tfidf.matrix <- scale(tfidf.matrix, center = FALSE, scale = col.norms)
    return(tfidf.matrix)
}

#### create.tfidf.from.query
# Build a unit-length tf-idf vector for a query over the vocabulary of
# doc.tfidf.
#
# doc.tfidf : term-by-document tf-idf matrix with the terms as row names
#             (output of create.tfidf.from.doclist).
# query     : character string holding the query text.
# Returns   : one-column matrix (as produced by scale()) with one row per
#             term of doc.tfidf. Every matched term is counted with a term
#             frequency of 1. Terms that are not in the query, or that carry
#             no weight in any document, are 0. If nothing matches, the
#             whole vector is 0 rather than NaN.
create.tfidf.from.query <- function(doc.tfidf, query) {
    if (dd) {
        print(class(query))
        print(query)
    }
    # Normalise the query text the same way the document corpus was, then
    # split it into terms. stripWhitespace() collapses each whitespace run
    # into one blank, so tabs and newlines separate terms as well.
    my.query <- removePunctuation(query)
    my.query <- removeNumbers(my.query)
    my.query <- tolower(my.query)
    my.query <- stripWhitespace(my.query)
    my.query <- unlist(strsplit(my.query, " ", fixed = TRUE))

    term.names <- row.names(doc.tfidf) # get term names from tfidf matrix
    numDocs <- ncol(doc.tfidf)
    numTerms <- length(term.names)
    if (dd) {
        print(numDocs)
        print(numTerms)
        print(my.query)
    }
    weights <- rep(0, numTerms)
    for (i in my.query) {
        term.index <- which(term.names == i)
        if (dd) {
            print("term index")
            print(term.index)
            print(i)
        }
        if (length(term.index) > 0) {       # length will be 0 if no match
            term <- doc.tfidf[term.index, ]
            # Ignore NA/NaN entries so they cannot turn the count into NA.
            term.df <- sum(term > 0, na.rm = TRUE)
            # A row without any positive weight gives df = 0, and
            # log2(numDocs / 0) is Inf, which becomes NaN once the vector
            # is normalised. Such a term contributes nothing to any
            # document score, so it keeps weight 0.
            if (term.df > 0) {
                weights[term.index] <- get.tf.idf.weights(c(1), term.df, numDocs)
            }
        }
    }
    # Scale to unit length. When no term matched the norm is 0; divide by 1
    # instead so the result stays all zero.
    query.norm <- sqrt(sum(weights^2))
    if (query.norm == 0) {
        query.norm <- 1
    }
    weights <- scale(weights, center = FALSE, scale = query.norm)
    return(weights)
}

#### create.doc.scores.from.tfidf.and.query
# Score every document against a query.
# doc.tfidf : output of create.tfidf.from.doclist
# query     : character string
# Returns t(query vector) %*% doc.tfidf, i.e. one similarity value per
# document column.
create.doc.scores.from.tfidf.and.query <- function(doc.tfidf, query) {
    # tf-idf weights of the query over the document vocabulary
    query.weights <- create.tfidf.from.query(doc.tfidf, query)
    # dot product of the query with each document column
    t(query.weights) %*% doc.tfidf
}