hp2FEM
0.1
|
Various routines with dealing with CSR matrices. More...
#include <GKlib.h>
Defines | |
#define | OMPMINOPS 50000 |
Functions | |
gk_csr_t * | gk_csr_Create () |
void | gk_csr_Init (gk_csr_t *mat) |
void | gk_csr_Free (gk_csr_t **mat) |
void | gk_csr_FreeContents (gk_csr_t *mat) |
gk_csr_t * | gk_csr_Dup (gk_csr_t *mat) |
gk_csr_t * | gk_csr_ExtractSubmatrix (gk_csr_t *mat, int rstart, int nrows) |
gk_csr_t * | gk_csr_ExtractRows (gk_csr_t *mat, int nrows, int *rind) |
gk_csr_t * | gk_csr_ExtractPartition (gk_csr_t *mat, int *part, int pid) |
gk_csr_t ** | gk_csr_Split (gk_csr_t *mat, int *color) |
gk_csr_t * | gk_csr_Read (char *filename, int format, int readvals, int numbering) |
void | gk_csr_Write (gk_csr_t *mat, char *filename, int format, int writevals, int numbering) |
gk_csr_t * | gk_csr_Prune (gk_csr_t *mat, int what, int minf, int maxf) |
gk_csr_t * | gk_csr_LowFilter (gk_csr_t *mat, int what, int norm, float fraction) |
gk_csr_t * | gk_csr_TopKPlusFilter (gk_csr_t *mat, int what, int topk, float keepval) |
gk_csr_t * | gk_csr_ZScoreFilter (gk_csr_t *mat, int what, float zscore) |
void | gk_csr_CompactColumns (gk_csr_t *mat) |
void | gk_csr_SortIndices (gk_csr_t *mat, int what) |
void | gk_csr_CreateIndex (gk_csr_t *mat, int what) |
void | gk_csr_Normalize (gk_csr_t *mat, int what, int norm) |
void | gk_csr_Scale (gk_csr_t *mat, int type) |
void | gk_csr_ComputeSums (gk_csr_t *mat, int what) |
void | gk_csr_ComputeSquaredNorms (gk_csr_t *mat, int what) |
float | gk_csr_ComputeSimilarity (gk_csr_t *mat, int i1, int i2, int what, int simtype) |
int | gk_csr_GetSimilarRows (gk_csr_t *mat, int nqterms, int *qind, float *qval, int simtype, int nsim, float minsim, gk_fkv_t *hits, int *i_marker, gk_fkv_t *i_cand) |
Various routines with dealing with CSR matrices.
$Id: csr.c 10537 2011-07-11 05:27:54Z karypis $
void gk_csr_CompactColumns | ( | gk_csr_t * | mat | ) |
Compacts the column-space of the matrix by removing empty columns. As a result of the compaction, the column numbers are renumbered. The compaction operation is done in place and only affects the row-based representation of the matrix.
mat | the matrix whose empty columns will be removed. |
float gk_csr_ComputeSimilarity | ( | gk_csr_t * | mat, |
int | i1, | ||
int | i2, | ||
int | what, | ||
int | simtype | ||
) |
Computes the similarity between two rows/columns
mat | the matrix itself. The routine assumes that the indices are sorted in increasing order. |
i1 | is the first row/column, |
i2 | is the second row/column, |
what | is either GK_CSR_ROW or GK_CSR_COL indicating the type of objects between the similarity will be computed, |
simtype | is the type of similarity and is one of GK_CSR_COS, GK_CSR_JAC, GK_CSR_MIN |
void gk_csr_ComputeSquaredNorms | ( | gk_csr_t * | mat, |
int | what | ||
) |
Computes the squared of the norms of the rows/columns
mat | the matrix itself, |
what | is either GK_CSR_ROW or GK_CSR_COL indicating which squared norms to compute. |
void gk_csr_ComputeSums | ( | gk_csr_t * | mat, |
int | what | ||
) |
Computes the sums of the rows/columns
mat | the matrix itself, |
what | is either GK_CSR_ROW or GK_CSR_COL indicating which sums to compute. |
gk_csr_t* gk_csr_Create | ( | ) |
Allocate memory for a CSR matrix and initializes it
void gk_csr_CreateIndex | ( | gk_csr_t * | mat, |
int | what | ||
) |
Creates a row/column index from the column/row data.
mat | the matrix itself, |
what | is either GK_CSR_ROW or GK_CSR_COL indicating which index will be created. |
gk_csr_t* gk_csr_Dup | ( | gk_csr_t * | mat | ) |
Returns a copy of a matrix.
mat | is the matrix to be duplicated. |
gk_csr_t* gk_csr_ExtractPartition | ( | gk_csr_t * | mat, |
int * | part, | ||
int | pid | ||
) |
Returns a submatrix corresponding to a specified partitioning of rows.
mat | is the original matrix. |
part | is the partitioning vector of the rows. |
pid | is the partition ID that will be extracted. |
gk_csr_t* gk_csr_ExtractRows | ( | gk_csr_t * | mat, |
int | nrows, | ||
int * | rind | ||
) |
Returns a submatrix containing a certain set of rows.
mat | is the original matrix. |
nrows | is the number of rows to extract. |
rind | is the set of row numbers to extract. |
gk_csr_t* gk_csr_ExtractSubmatrix | ( | gk_csr_t * | mat, |
int | rstart, | ||
int | nrows | ||
) |
Returns a submatrix containint a set of consecutive rows.
mat | is the original matrix. |
rstart | is the starting row. |
nrows | is the number of rows from rstart to extract. |
void gk_csr_Free | ( | gk_csr_t ** | mat | ) |
Frees all the memory allocated for matrix.
mat | is the matrix to be freed. |
void gk_csr_FreeContents | ( | gk_csr_t * | mat | ) |
Frees only the memory allocated for the matrix's different fields and sets them to NULL.
mat | is the matrix whose contents will be freed. |
int gk_csr_GetSimilarRows | ( | gk_csr_t * | mat, |
int | nqterms, | ||
int * | qind, | ||
float * | qval, | ||
int | simtype, | ||
int | nsim, | ||
float | minsim, | ||
gk_fkv_t * | hits, | ||
int * | i_marker, | ||
gk_fkv_t * | i_cand | ||
) |
Finds the n most similar rows (neighbors) to the query using cosine similarity.
mat | the matrix itself |
nqterms | is the number of columns in the query |
qind | is the list of query columns |
qval | is the list of correspodning query weights |
simtype | is the type of similarity and is one of GK_CSR_COS, GK_CSR_JAC, GK_CSR_MIN |
nsim | is the maximum number of requested most similar rows. If -1 is provided, then everything is returned unsorted. |
minsim | is the minimum similarity of the requested most similar rows |
hits | is the result set. This array should be at least of length nsim. |
i_marker | is an array of size equal to the number of rows whose values are initialized to -1. If NULL is provided then this array is allocated and freed internally. |
i_cand | is an array of size equal to the number of rows. If NULL is provided then this array is allocated and freed internally. |
void gk_csr_Init | ( | gk_csr_t * | mat | ) |
Initializes the matrix
mat | is the matrix to be initialized. |
gk_csr_t* gk_csr_LowFilter | ( | gk_csr_t * | mat, |
int | what, | ||
int | norm, | ||
float | fraction | ||
) |
Eliminates certain entries from the rows/columns of the matrix. The filtering takes place by keeping only the highest weight entries whose sum accounts for a certain fraction of the overall weight of the row/column.
mat | the matrix to be prunned, |
what | indicates if the rows (GK_CSR_ROW) or the columns (GK_CSR_COL) of the matrix will be prunned, |
norm | indicates the norm that will be used to aggregate the weights and possible values are 1 or 2, |
fraction | is the fraction of the overall norm that will be retained by the kept entries. |
void gk_csr_Normalize | ( | gk_csr_t * | mat, |
int | what, | ||
int | norm | ||
) |
Normalizes the rows/columns of the matrix to be unit length.
mat | the matrix itself, |
what | indicates what will be normalized and is obtained by specifying GK_CSR_ROW, GK_CSR_COL, GK_CSR_ROW|GK_CSR_COL. |
norm | indicates what norm is to normalize to, 1: 1-norm, 2: 2-norm |
gk_csr_t* gk_csr_Prune | ( | gk_csr_t * | mat, |
int | what, | ||
int | minf, | ||
int | maxf | ||
) |
Prunes certain rows/columns of the matrix. The prunning takes place by analyzing the row structure of the matrix. The prunning takes place by removing rows/columns but it does not affect the numbering of the remaining rows/columns.
mat | the matrix to be prunned, |
what | indicates if the rows (GK_CSR_ROW) or the columns (GK_CSR_COL) of the matrix will be prunned, |
minf | is the minimum number of rows (columns) that a column (row) must be present in order to be kept, |
maxf | is the maximum number of rows (columns) that a column (row) must be present at in order to be kept. |
gk_csr_t* gk_csr_Read | ( | char * | filename, |
int | format, | ||
int | readvals, | ||
int | numbering | ||
) |
Reads a CSR matrix from the supplied file and stores it the matrix's forward structure.
filename | is the file that stores the data. |
format | is either GK_CSR_FMT_CLUTO or GK_CSR_FMT_CSR specifying the type of the input format. The difference between the two formats is that GK_CSR_FMT_CLUTO requires the header line, whereas GK_CSR_FMT_CSR does not. |
readvals | is either 1 or 0, indicating if the CSR file contains values or it does not. It only applies when GK_CSR_FMT_CSR is used. |
numbering | is either 1 or 0, indicating if the numbering of the indices start from 1 or 0, respectively. If they start from 1, they are automatically decreamented during input so that they will start from 0. It only applies when GK_CSR_FMT_CSR is used. |
void gk_csr_Scale | ( | gk_csr_t * | mat, |
int | type | ||
) |
Applies different row scaling methods.
mat | the matrix itself, |
type | indicates the type of row scaling. Possible values are: GK_CSR_MAXTF, GK_CSR_SQRT, GK_CSR_LOG, GK_CSR_IDF. |
void gk_csr_SortIndices | ( | gk_csr_t * | mat, |
int | what | ||
) |
Sorts the indices in increasing order
mat | the matrix itself, |
what | is either GK_CSR_ROW or GK_CSR_COL indicating which set of indices to sort. |
gk_csr_t** gk_csr_Split | ( | gk_csr_t * | mat, |
int * | color | ||
) |
Splits the matrix into multiple sub-matrices based on the provided color array.
mat | is the original matrix. |
color | is an array of size equal to the number of non-zeros in the matrix (row-wise structure). The matrix is split into as many parts as the number of colors. For meaningfull results, the colors should be numbered consecutively starting from 0. |
gk_csr_t* gk_csr_TopKPlusFilter | ( | gk_csr_t * | mat, |
int | what, | ||
int | topk, | ||
float | keepval | ||
) |
Eliminates certain entries from the rows/columns of the matrix. The filtering takes place by keeping only the highest weight top-K entries along each row/column and those entries whose weight is greater than a specified value.
mat | the matrix to be prunned, |
what | indicates if the rows (GK_CSR_ROW) or the columns (GK_CSR_COL) of the matrix will be prunned, |
topk | is the number of the highest weight entries to keep. |
keepval | is the weight of a term above which will be kept. This is used to select additional terms past the first topk. |
void gk_csr_Write | ( | gk_csr_t * | mat, |
char * | filename, | ||
int | format, | ||
int | writevals, | ||
int | numbering | ||
) |
Writes the row-based structure of a matrix into a file.
mat | is the matrix to be written, |
filename | is the name of the output file. |
format | is one of GK_CSR_FMT_CLUTO or GK_CSR_FMT_CSR specifying the format of the output file. |
writevals | is either 1 or 0 indicating if the values will be written or not. This is only applicable when GK_CSR_FMT_CSR is used. |
numbering | is either 1 or 0 indicating if the internal 0-based numbering will be shifted by one or not during output. This is only applicable when GK_CSR_FMT_CSR is used. |
gk_csr_t* gk_csr_ZScoreFilter | ( | gk_csr_t * | mat, |
int | what, | ||
float | zscore | ||
) |
Eliminates certain entries from the rows/columns of the matrix. The filtering takes place by keeping only the terms whose contribution to the total length of the document is greater than a user-splied multiple over the average.
This routine assumes that the vectors are normalized to be unit length.
mat | the matrix to be prunned, |
what | indicates if the rows (GK_CSR_ROW) or the columns (GK_CSR_COL) of the matrix will be prunned, |
zscore | is the multiplicative factor over the average contribution to the length of the document. |