Team:Paris Liliane Bettencourt/Project/SIP/Codes
From 2010.igem.org
(Difference between revisions)
(New page: {{Template:Paris2010_2}} <html> <p style="display:block"> <a href=""https://2010.igem.org/Team:Paris_Liliane_Bettencourt/Projects/SIP"> <img src="https://static.igem.org/mediawiki/2010/4/4c/SI...) |
|||
Line 33: | Line 33: | ||
</p> | </p> | ||
</html> | </html> | ||
+ | |||
+ | |||
+ | <p style="display:block;"> | ||
+ | <br /> | ||
+ | These are the two programs I wrote for the iGEM competition.<br /> | ||
+ | Note that '''this code is in the public domain''': the algorithm is very simple, and I think it is better to share this kind of code with anybody, without a license or other restrictions. This tiny program may help somebody, so '''let's share the code!'''<br /> | ||
+ | Also, the code is written in C, using the sqlite3 API. The programs are '''written for Linux''', but in theory they can '''work on Mac OS X and Windows''' (use Cygwin for wget, and MinGW for sqlite3). | ||
+ | For the random module, we used a C++ implementation, [http://www-personal.umich.edu/~wagnerr/MersenneTwister.html MersenneTwister], so use g++ to compile.<br /><br /> | ||
+ | |||
+ | *'''SIP.C :''' lets you analyse any text and generate its SIP words, that is, '''the words that are most probable in your wiki but least probable across all wikis'''. | ||
+ | Here it is configured to download iGEM wikis. Put the names of the iGEM teams you want to process in a list. First, the program computes the frequency of every word across all these texts, to build a dictionary against which words are compared. | ||
+ | Then, for each team, it computes the frequency of each word, compares it with the same word in the dictionary, and derives the SIP values. All results are stored in a database. | ||
+ | |||
+ | <br /> | ||
+ | |||
+ | *'''TOP_SIP.C :''' once the SIP values have been generated, this program '''gets, for each wiki, the N highest SIP words''', where N is a number you choose. | ||
+ | Specify the database you want to use and a list file containing the names of the teams to process. The last argument, the coefficient, is the step by which the highest SIP value is lowered whenever the previous pass did not return the number of words you want: it is a number between ''1'' and ''15'', which gives a step between ''10^-1'' and ''10^-15''. Adjust this number as you like: a good coefficient gives better results and speeds up the program, but the best coefficient is not necessarily the highest. | ||
+ | About the random selection: to pick random words fairly, I count how many words there are; if every word is to have the same chance of being selected, each word has a probability of ''nbr-of-words-selected/nbr-of-words-to-be-choosed'', so the virtual dice gives a number between ''0'' and ''(nbr-of-word/nbr-of-words-selected) - 1''. | ||
+ | |||
+ | <br /><br /> | ||
+ | </p> | ||
== SIP.C == | == SIP.C == | ||
Line 114: | Line 135: | ||
} | } | ||
+ | /* | ||
+ | Change a character by an other one in a string. | ||
+ | @string : your string | ||
+ | @c : char to remove | ||
+ | @r : char to set instead | ||
+ | @len : length of the string | ||
+ | */ | ||
void chgchar( char *string, char c, char r, int len ) | void chgchar( char *string, char c, char r, int len ) | ||
{ | { | ||
Line 139: | Line 167: | ||
} | } | ||
+ | /* | ||
+ | Read the next word in your file | ||
+ | @word : buffer to put your next word | ||
+ | @fp : descriptor to our file. | ||
+ | */ | ||
void read_next_word( char word[MAXBUFFER], FILE *fp ) | void read_next_word( char word[MAXBUFFER], FILE *fp ) | ||
{ | { | ||
Line 167: | Line 200: | ||
} | } | ||
+ | /* | ||
+ | Check if the word is already in the database. | ||
+ | */ | ||
int is_word_exists( char *word, sqlite3_stmt **statement, sqlite3** conn ) | int is_word_exists( char *word, sqlite3_stmt **statement, sqlite3** conn ) | ||
{ | { | ||
Line 185: | Line 221: | ||
} | } | ||
+ | /* | ||
+ | Add a word to the database. | ||
+ | */ | ||
void add_to_the_list( char *word, sqlite3_stmt **statement, sqlite3** conn ) | void add_to_the_list( char *word, sqlite3_stmt **statement, sqlite3** conn ) | ||
{ | { | ||
Line 197: | Line 236: | ||
} | } | ||
+ | /* | ||
+ | Increment the occurrence count of this word, if it is already in the database. | ||
+ | */ | ||
void inc_occ( char *word, sqlite3_stmt **statement, sqlite3** conn ) | void inc_occ( char *word, sqlite3_stmt **statement, sqlite3** conn ) | ||
{ | { | ||
Line 776: | Line 818: | ||
</p> | </p> | ||
+ | == TOP-SIP.C == | ||
+ | <p style="display:block"> | ||
+ | <pre> | ||
+ | /* TOP_SIP.C *** Give the words with the highest SIP values | ||
+ | |||
+ | PUBLIC DOMAIN | ||
+ | From iGEM team 2010 Paris | ||
+ | |||
+ | comment: | ||
+ | |||
+ | This code lets you get the words with the highest SIP values. | ||
+ | Run SIP.C before using this tool. | ||
+ | For the random module you can use whatever you want; we use | ||
+ | MersenneTwister, which is written in C++, which is why you have | ||
+ | to compile with g++ in this case. | ||
+ | |||
+ | MersenneTwister : http://www-personal.umich.edu/~wagnerr/MersenneTwister.html | ||
+ | |||
+ | build: | ||
+ | |||
+ | $ g++ -o top-sip top-sip.c -lsqlite3 | ||
+ | |||
+ | usage : | ||
+ | |||
+ | $ ./top-sip [list of team name] [name of database] [coeff] | ||
+ | |||
+ | |||
+ | */ | ||
+ | |||
+ | #include <stdio.h> | ||
+ | #include <stdlib.h> | ||
+ | #include <string.h> | ||
+ | #include <dirent.h> | ||
+ | #include <sqlite3.h> | ||
+ | #include "MersenneTwister.hpp" | ||
+ | |||
+ | |||
+ | #define MAXBUFFER 256 | ||
+ | #define LEN_CMD 96 | ||
+ | |||
+ | #define TEAM_NAME 1 | ||
+ | #define DATABASE_NAME 2 | ||
+ | #define DEC_COEFF 3 | ||
+ | |||
+ | #define FALSE 0 | ||
+ | #define TRUE 1 | ||
+ | |||
+ | #define MIN_COEFF 0 | ||
+ | #define MAX_COEFF 15 | ||
+ | |||
+ | #define WORD_C 0 | ||
+ | #define FREQ_C 2 | ||
+ | #define SIP_C 3 | ||
+ | |||
+ | // arbitrary multiplicator | ||
+ | #define MUL 100000 | ||
+ | |||
+ | //number of words you want | ||
+ | #define TOP 25 | ||
+ | |||
/*
    Replace every occurrence of one character by another one, in place.
    @string : buffer to modify (a NULL pointer is ignored)
    @c      : char to remove
    @r      : char to set instead
    @len    : number of bytes of @string to scan; a value <= 0 scans nothing

    Exactly @len bytes are examined: the scan does not stop at a '\0',
    so @len must not be larger than the buffer.
*/
void chgchar( char *string, char c, char r, int len )
{
    int i;

    /* nothing to do, and nothing safe to touch */
    if ( string == NULL || len <= 0 )
        return;

    for ( i = 0; i < len; i++ )
    {
        if ( string[i] == c )
            string[i] = r;
    }
}
+ | |||
+ | /* | ||
+ | pick a random number between 0 and X, use your favorite random algorithm ! I use Mersenne Twister. | ||
+ | @prob : your X, so you have 1/X prob to get the good number. X+1 numbers can be choosen. | ||
+ | return : the random number | ||
+ | */ | ||
+ | int pickrandom( int prob ) | ||
+ | { | ||
+ | MTRand mtrand1; | ||
+ | int d = mtrand1.randInt( prob ); | ||
+ | return d; | ||
+ | } | ||
+ | |||
+ | void compute_top_sip( char *team_list, char *database_name, float dec_coeff ) | ||
+ | { | ||
+ | FILE *fp, *fp_team; | ||
+ | sqlite3_stmt *statement; | ||
+ | |||
+ | char sqlite_query[MAXBUFFER]; | ||
+ | char top_words[TOP][MAXBUFFER]; | ||
+ | char team_name[MAXBUFFER-LEN_CMD]; | ||
+ | |||
+ | int wanted_stuff, total_stuff, prob, i; | ||
+ | unsigned long int t; | ||
+ | int size_name, index; | ||
+ | double top_freq[TOP]; | ||
+ | double max_sip, curr_sip; | ||
+ | |||
+ | sqlite3 *conn; | ||
+ | if ( sqlite3_open( database_name, &conn ) != SQLITE_OK ) | ||
+ | { | ||
+ | printf("Error opening the db: %s\n", sqlite3_errmsg(conn)); | ||
+ | exit(1); | ||
+ | } | ||
+ | |||
+ | fp = fopen(team_list, "r"); | ||
+ | |||
+ | if ( fp == 0 ) { | ||
+ | printf("Can't open %s file\n", team_list); | ||
+ | exit(1); | ||
+ | } | ||
+ | |||
+ | // for each team | ||
+ | while( !feof(fp) ) | ||
+ | { | ||
+ | |||
+ | fgets( team_name, (MAXBUFFER - LEN_CMD), fp ); | ||
+ | |||
+ | if ( feof(fp) ) { | ||
+ | break; | ||
+ | } | ||
+ | |||
+ | size_name = strlen( team_name ); | ||
+ | team_name[size_name - 1] = '\0'; | ||
+ | chgchar( team_name, '-', '_', size_name); | ||
+ | printf("On the %s Team\n", team_name); | ||
+ | |||
+ | max_sip = 0; | ||
+ | |||
+ | snprintf( sqlite_query, MAXBUFFER, "SELECT * FROM %s", team_name); | ||
+ | |||
+ | if ( sqlite3_prepare_v2(conn, sqlite_query , -1, | ||
+ | &statement, NULL ) != SQLITE_OK ) | ||
+ | { | ||
+ | printf("COMPUTE HIGHT SIP : Error compiling the request: %s\n", sqlite3_errmsg(conn) ); | ||
+ | goto CLOSE_DB; | ||
+ | exit(1); | ||
+ | } | ||
+ | |||
+ | t = 0; | ||
+ | while ( sqlite3_step( statement ) != SQLITE_DONE ) { | ||
+ | curr_sip = (double)sqlite3_column_double(statement, SIP_C); | ||
+ | if ( curr_sip > max_sip ) { | ||
+ | max_sip = curr_sip; | ||
+ | } | ||
+ | t++; | ||
+ | } | ||
+ | printf("hight sip is %.15f, for %ld entry\n", max_sip, t); | ||
+ | sqlite3_finalize( statement ); | ||
+ | |||
+ | PICK_TOP: | ||
+ | index = 0; | ||
+ | prob = 0; | ||
+ | wanted_stuff = TOP; | ||
+ | total_stuff = 0; | ||
+ | |||
+ | snprintf( sqlite_query, MAXBUFFER, "SELECT * FROM %s WHERE sip > %.15f", team_name, max_sip); | ||
+ | |||
+ | if ( sqlite3_prepare_v2(conn, sqlite_query , -1, | ||
+ | &statement, NULL ) != SQLITE_OK ) | ||
+ | { | ||
+ | printf("PICK TOP FIVE : Error compiling the request: %s\n", sqlite3_errmsg(conn) ); | ||
+ | goto CLOSE_DB; | ||
+ | exit(1); | ||
+ | } | ||
+ | |||
+ | |||
+ | while ( sqlite3_step( statement ) != SQLITE_DONE ) { | ||
+ | total_stuff++; | ||
+ | } | ||
+ | |||
+ | if( total_stuff < TOP ) { | ||
+ | sqlite3_finalize( statement ); | ||
+ | max_sip = max_sip - dec_coeff; | ||
+ | if ( max_sip <= 0 ) { | ||
+ | continue; | ||
+ | } | ||
+ | goto PICK_TOP; | ||
+ | } | ||
+ | |||
+ | prob = (int)total_stuff / wanted_stuff; // we put the fraction to 1/X and we save X. | ||
+ | printf("prob is : %f, (%d/%d), give a number between 0 and %d\n", | ||
+ | (float)wanted_stuff / total_stuff, TOP, total_stuff, prob); | ||
+ | prob--; | ||
+ | |||
+ | while ( sqlite3_step( statement ) != SQLITE_DONE ) { | ||
+ | |||
+ | if ( pickrandom(prob) == FALSE ) { | ||
+ | strncpy( top_words[index], (const char*)sqlite3_column_text(statement, WORD_C), MAXBUFFER); | ||
+ | top_freq[index] = (double)sqlite3_column_double(statement, FREQ_C); | ||
+ | index++; | ||
+ | if ( index == TOP ) { | ||
+ | break; | ||
+ | } | ||
+ | } | ||
+ | } | ||
+ | printf("Your top-words for this team is :\n"); | ||
+ | |||
+ | fp_team = fopen(team_name, "w"); | ||
+ | |||
+ | for( index=0;index<TOP;index++) { | ||
+ | printf("\t- %s (freq = %f; len=%d)\n", top_words[index], | ||
+ | (float)(top_freq[index] * MUL), (int)(top_freq[index] * MUL)); | ||
+ | fprintf(fp_team, "%s: %d\n", top_words[index], occ); | ||
+ | } | ||
+ | fclose( fp_team ); | ||
+ | sqlite3_finalize( statement ); | ||
+ | |||
+ | } | ||
+ | fclose(fp); | ||
+ | |||
+ | CLOSE_DB : | ||
+ | if ( sqlite3_close(conn) != SQLITE_OK ) { | ||
+ | printf("Error closing the db: %s\n", sqlite3_errmsg(conn)); | ||
+ | } | ||
+ | } | ||
+ | |||
+ | int main( int argc, char *argv[] ) | ||
+ | { | ||
+ | |||
+ | if ( argv[TEAM_NAME] == 0 || argv[DATABASE_NAME] == 0 || argv[DEC_COEFF] == 0 ) { | ||
+ | printf("usage : %s [list of team name] [name of database] [dec-coeff]\n", argv[0] ); | ||
+ | printf("/tUse 0 to 15 to define a dec-coeff.\n"); | ||
+ | exit(1); | ||
+ | } | ||
+ | |||
+ | if ( atoi( argv[DEC_COEFF]) >= MIN_COEFF && atoi( argv[DEC_COEFF]) <= MAX_COEFF ) { | ||
+ | printf("Compute with 10^-%s dec_coeff!\n", argv[DEC_COEFF]); | ||
+ | compute_top_sip( argv[TEAM_NAME], argv[DATABASE_NAME], powf( 10, -atoi(argv[DEC_COEFF])) ); | ||
+ | } else { | ||
+ | printf("Use 0 to 15 to define a dec-coeff.\n"); | ||
+ | } | ||
+ | |||
+ | return 0; | ||
+ | } | ||
+ | |||
+ | </pre> | ||
+ | |||
+ | <br /> | ||
+ | <br /> | ||
+ | </p> | ||
<html> | <html> | ||
</div> | </div> | ||
</div> | </div> | ||
</html> | </html> |
Revision as of 16:39, 26 October 2010
These are the two programs I wrote for the iGEM competition.
Note that this code is in the public domain: the algorithm is very simple, and I think it is better to share this kind of code with anybody, without a license or other restrictions. This tiny program may help somebody, so let's share the code!
Also, the code is written in C, using the sqlite3 API. The programs are written for Linux, but in theory they can work on Mac OS X and Windows (use Cygwin for wget, and MinGW for sqlite3).
For the random module, we used a C++ implementation, [http://www-personal.umich.edu/~wagnerr/MersenneTwister.html MersenneTwister], so use g++ to compile.
- SIP.C : lets you analyse any text and generate its SIP words, that is, the words that are most probable in your wiki but least probable across all wikis.
- TOP_SIP.C : once the SIP values have been generated, this program gets, for each wiki, the N highest SIP words, where N is a number you choose.
SIP.C
/* SIP.C *** Make SIP database and dictionary PUBLIC DOMAIN From iGEM team 2010 Paris comment: This code permit you to calculate SIP words (most improbable words) in each wiki team. You need sqlite3, wget and links to run this program. build: $ gcc -o sip sip.c -lsqlite3 usage : $ ./sip [list of team name] [name of database] [year] Sqlite3 database : Table for team. +--------+-----------+------------+-----------+ | Words | local occ | local freq | SIP value | +--------+-----------+------------+-----------+ | string | u_long | float | float | | | | | | +--------+-----------+------------+-----------+ Table name : Dictionary. +--------+------------+-------------+ | word | global_occ | global_freq | +--------+------------+-------------+ | string | u_long | float | | | | | +--------+------------+-------------+ */ #include <stdio.h> #include <stdlib.h> #include <string.h> #include <dirent.h> #include <sqlite3.h> #define MAXBUFFER 256 #define LEN_CMD 96 #define TEAM_NAME 1 #define DATABASE_NAME 2 #define YEAR 3 #define FALSE 0 #define TRUE 1 #define IS_REP 1 #define NO_REP 2 /* isRep say if the stream is a directory or a file. @entryname : name of the file return : IS_REP it's a directory, NO_REP it's a file */ int isRep( char *entryname ) { FILE *fp; fp = fopen( entryname, "rb" ); if ( fp == 0 ) return IS_REP; else { fclose( fp ); return NO_REP; } } /* Change a character by an other one in a string. @string : your string @c : char to remove @r : char to set instead @len : lenght of the string */ void chgchar( char *string, char c, char r, int len ) { int i = 0; while ( i < len ) { if ( string[i] == c ) { string[i] = r; i++; } else i++; } } int close_sqlitedb( sqlite3*** conn ) { if ( sqlite3_close(**conn) != SQLITE_OK ) { printf("Error closing the db: %s\n", sqlite3_errmsg(*conn)); } } /* Read the next word in your file @word : buffer to put your next word @fp : descriptor to our file. 
*/ void read_next_word( char word[MAXBUFFER], FILE *fp ) { char buffer[MAXBUFFER]; char ch; int n = 0; ch = fgetc(fp); if( ch == EOF ) { return; } while ( ch != 0x20 && ch != EOF && n < MAXBUFFER ) { buffer[n] = ch; n++; ch = fgetc(fp); } buffer[n] = '\0'; strcpy( word, buffer ); } int is_word_valid( char word ) { // there's no filter at the moment. // We can compare with MeSH white list ie. } /* Check if the world is already in the database. */ int is_word_exists( char *word, sqlite3_stmt **statement, sqlite3** conn ) { if ( sqlite3_bind_text( *statement, 1, word, -1, SQLITE_STATIC) != SQLITE_OK ) { printf("Error binding to query select: %s\n", sqlite3_errmsg(*conn)); close_sqlitedb( &conn ); exit(1); } if ( sqlite3_step( *statement ) != SQLITE_DONE ) { sqlite3_reset( *statement ); return TRUE; } else { sqlite3_reset( *statement ); return FALSE; } } /* Add a word to the database. */ void add_to_the_list( char *word, sqlite3_stmt **statement, sqlite3** conn ) { if ( sqlite3_bind_text( *statement, 1, word, -1, SQLITE_STATIC) != SQLITE_OK ) { printf("Error binding to query insert: %s\n", sqlite3_errmsg(*conn)); close_sqlitedb( &conn ); exit(1); } sqlite3_step( *statement ); sqlite3_reset( *statement ); } /* Increment the occurence of this word, if is already in the database. */ void inc_occ( char *word, sqlite3_stmt **statement, sqlite3** conn ) { // UPDATE dictionary SET global_occ = global_occ + 1 WHERE word=word // INSERT into dictionary ... if ( sqlite3_bind_text( *statement, 1, word, -1, SQLITE_STATIC) != SQLITE_OK ) { printf("Error binding to query update: %s\n", sqlite3_errmsg(*conn)); close_sqlitedb( &conn ); exit(1); } sqlite3_step( *statement ); sqlite3_reset( *statement ); } void compute_freq( sqlite3** conn, float nbr_of_words, char *table ) { // UPDATE dictionary SET global_freq = global_occ / nbr_of_word // note that nbr_of_word must be a REAL type. 
/* For each words we compute : F = occ_of_word / nbr_of_words */ char sqlite3_query[MAXBUFFER]; sqlite3_stmt *statement; sprintf( sqlite3_query, "UPDATE %s SET global_freq = global_occ / %f", table, nbr_of_words); if ( sqlite3_prepare_v2(*conn, sqlite3_query, -1, &statement, NULL ) != SQLITE_OK ) { printf("Error compiling the request: %s\n", sqlite3_errmsg(*conn) ); close_sqlitedb( &conn ); exit(1); } sqlite3_step( statement ); sqlite3_finalize( statement ); } make_dictionary( char *team_list, char *database_name) { FILE *fp, *fp_wiki; float nbr_of_words = 0; char word[MAXBUFFER], digest_name[MAXBUFFER-LEN_CMD], team_name[MAXBUFFER-LEN_CMD]; int size_name; sqlite3_stmt *stmt_select, *stmt_insert, *stmt_update; sqlite3 *conn; if ( sqlite3_open( database_name, &conn ) != SQLITE_OK ) { printf("Error opening the db: %s\n", sqlite3_errmsg(conn)); exit(1); } fp = fopen(team_list, "r"); if ( fp == 0 ) { printf("Can't open %s file\n", team_list); exit(1); } // pre-compile the query to optimize the process if ( sqlite3_prepare_v2(conn, "SELECT * FROM dictionary WHERE word=?", -1, &stmt_select, NULL ) != SQLITE_OK ) { printf("Error compiling the request: %s\n", sqlite3_errmsg(conn) ); goto CLOSE_DB; exit(1); } if ( sqlite3_prepare_v2(conn, "INSERT INTO dictionary VALUES (?, 1, NULL)", -1, &stmt_insert, NULL ) != SQLITE_OK ) { printf("Error compiling the request: %s\n", sqlite3_errmsg(conn) ); goto CLOSE_DB; exit(1); } if ( sqlite3_prepare_v2(conn, "UPDATE dictionary \ SET global_occ = global_occ + 1 WHERE word=?", -1, &stmt_update, NULL ) != SQLITE_OK ) { printf("Error compiling the request: %s\n", sqlite3_errmsg(conn) ); goto CLOSE_DB; exit(1); } while( !feof(fp) ) { fgets( team_name, (MAXBUFFER - LEN_CMD), fp ); if ( feof(fp) ) { break; } size_name = strlen( team_name ); team_name[size_name - 1] = '\0'; chgchar( team_name, '-', '_', size_name); printf("On the %s Team\n", team_name); if ( chdir(team_name) == -1 ) { printf("error changing directory to %s\n", team_name); 
exit(1); } strncpy( digest_name, team_name, (MAXBUFFER-LEN_CMD-4) ); strcat( digest_name, ".xtr"); fp_wiki = fopen( digest_name, "r" ); if( fp_wiki == 0 ) { printf("error when you open %s file", digest_name); exit(1); } if ( chdir("..") == -1 ) { printf("error changing directory to ..\n"); exit(1); } sqlite3_exec(conn, "begin", NULL, NULL, NULL ); while ( !feof(fp_wiki) ) { read_next_word( word, fp_wiki ); if ( is_word_valid( word ) == FALSE ) { continue; } if ( is_word_exists( word, &stmt_select, &conn ) == FALSE ) { add_to_the_list( word, &stmt_insert, &conn ); } else { inc_occ( word, &stmt_update, &conn ); } nbr_of_words++; } sqlite3_exec(conn, "commit", NULL, NULL, NULL ); fclose(fp_wiki); } fclose(fp); sqlite3_finalize( stmt_select ); sqlite3_finalize( stmt_insert ); sqlite3_finalize( stmt_update ); compute_freq( &conn, nbr_of_words, "dictionary" ); CLOSE_DB : if ( sqlite3_close(conn) != SQLITE_OK ) { printf("Error closing the db: %s\n", sqlite3_errmsg(conn)); } } void make_database( char *team_list, char *database_name ) { sqlite3 *conn; sqlite3_stmt *statement; FILE* fp; int size_name; char team_name[MAXBUFFER-LEN_CMD], sqlite_query[MAXBUFFER]; if ( sqlite3_open( database_name, &conn ) != SQLITE_OK ) { printf("Error opening the db: %s\n", sqlite3_errmsg(conn)); exit(1); } if ( sqlite3_prepare_v2(conn, "CREATE TABLE dictionary ( word text, \ global_occ int, global_freq real )", -1, &statement, NULL ) != SQLITE_OK ) { printf("Error compiling the request: %s\n", sqlite3_errmsg(conn) ); goto CLOSE_DB; exit(1); } sqlite3_step( statement ); sqlite3_finalize( statement ); fp = fopen(team_list, "r"); if ( fp == 0 ) { printf("Can't open %s file\n", team_list); exit(1); } while( !feof(fp) ) { fgets( team_name, (MAXBUFFER - LEN_CMD), fp ); if ( feof(fp) ) { break; } size_name = strlen( team_name ); team_name[size_name - 1] = '\0'; chgchar( team_name, '-', '_', size_name); printf("Create %s Team Table\n", team_name); sprintf( sqlite_query, "CREATE TABLE %s ( word text, 
global_occ int, \ global_freq real, sip real )", team_name); if ( sqlite3_prepare_v2(conn, sqlite_query, -1, &statement, NULL ) != SQLITE_OK ) { printf("Error compiling the request: %s\n", sqlite3_errmsg(conn) ); goto CLOSE_DB; exit(1); } sqlite3_step( statement ); sqlite3_reset( statement ); } fclose( fp); sqlite3_finalize( statement ); CLOSE_DB : if ( sqlite3_close(conn) != SQLITE_OK ) { printf("Error closing the db: %s\n", sqlite3_errmsg(conn)); } } void download_wiki( char *team_list, char* year ) { FILE *fp_src, *fp_tar, *fp; char ch, last_ch; char buffer[MAXBUFFER], team_name[MAXBUFFER-LEN_CMD], digest_name[MAXBUFFER-LEN_CMD]; int size_name; DIR *dp; struct dirent *entry; fp = fopen(team_list, "r"); if ( fp == 0 ) { printf("Can't open %s file\n", team_list); exit(1); } while( !feof(fp) ) { fgets( team_name, (MAXBUFFER - LEN_CMD), fp ); if ( feof(fp) ) { break; } size_name = strlen( team_name ); team_name[size_name - 1] = '\0'; chgchar( team_name, '-', '_', size_name); if ( mkdir(team_name, 0777) == -1 ) { printf("error in the mkdir\n"); exit(1); } if ( chdir(team_name) == -1 ) { printf("error changing the directory to %s\n", team_name); exit(1); } if ( strcmp( year, "2007") == 0 ) { snprintf(buffer, MAXBUFFER, "wget -R.jpg -R.png -R.gif \ -R.jpeg -RUser:* -E -l1 -nd -r \ http://parts.mit.edu/igem07/index.php/%s", team_name); printf("CMD : %s\n", buffer); system(buffer); } else { snprintf(buffer, MAXBUFFER, "wget -E -ITeam:%s -nd -r \ http://%s.igem.org/Team:%s", team_name, year, team_name); printf("CMD : %s\n", buffer); system(buffer); } if((dp = opendir(".")) == NULL) { printf("can't open the dir"); return; } while((entry = readdir(dp)) != NULL) { if ( isRep(entry->d_name) != IS_REP ) { if ( (strstr(entry->d_name, ".html") != NULL) && (strstr(entry->d_name, ".dumpx") == NULL) ) { snprintf(buffer, MAXBUFFER, "links -dump %s > %s.dumpx", entry->d_name, entry->d_name); system(buffer); } } } closedir(dp); snprintf(buffer, MAXBUFFER, "cat *.dumpx > %s", 
team_name); system(buffer); strncpy( digest_name, team_name, (MAXBUFFER-LEN_CMD-4) ); strcat( digest_name, ".xtr"); fp_src = fopen(team_name, "r"); if ( fp == 0 ) { printf("Can't open %s file\n", team_name); exit(1); } fp_tar = fopen(digest_name, "w"); if ( fp == 0 ) { printf("Can't open %s file\n", digest_name); exit(1); } while( !feof(fp_src) ) { ch = fgetc(fp_src); if ( (ch >= 0x41 && ch <= 0x5A) || (ch >= 0x61 && ch <= 0x7A) || (ch == 0x20 && last_ch != 0x20) || ch == 0x2D || (ch >= 0x30 && ch <= 0x39) ) { if (ch >= 0x41 && ch <= 0x5A) { //put in lowercase ch += 0x20; } fputc(ch, fp_tar); } else { ch = 0x20; if ( last_ch != 0x20 ) { fputc(ch, fp_tar); } } last_ch = ch; } fclose( fp_src ); fclose( fp_tar ); if ( chdir("..") == -1 ) { printf("error changing the directory to %s\n", team_name); exit(1); } } fclose(fp); } void compute_sip( sqlite3 **conn, char *team_name ) { // SELECT * from TEAM_NAME; // pour chaque result -> get the freq | get the word => get the freq of word in all wiki // f/F // UPDATE where word='yourword' sqlite3_stmt *statement, *stmt_select, *stmt_update; char sqlite_query[MAXBUFFER]; float local_freq, global_freq; int res; snprintf( sqlite_query, MAXBUFFER, "SELECT * from %s", team_name); if ( sqlite3_prepare_v2(*conn, sqlite_query, -1, &statement, NULL ) != SQLITE_OK ) { printf("compute_sip() : Error compiling the request 1: %s\n", sqlite3_errmsg(*conn) ); close_sqlitedb( &conn ); exit(1); } if ( sqlite3_prepare_v2(*conn, "SELECT * from dictionary WHERE word=?", -1, &stmt_select, NULL ) != SQLITE_OK ) { printf("compute_sip() : Error compiling the request 2: %s\n", sqlite3_errmsg(*conn) ); close_sqlitedb( &conn ); exit(1); } snprintf( sqlite_query, MAXBUFFER, "UPDATE %s SET sip = ?/? 
WHERE word=?", team_name); if ( sqlite3_prepare_v2(*conn, sqlite_query, -1, &stmt_update, NULL ) != SQLITE_OK ) { printf("compute_sip() : Error compiling the request 3: %s\n", sqlite3_errmsg(*conn) ); close_sqlitedb( &conn ); exit(1); } sqlite3_exec(*conn, "begin", NULL, NULL, NULL ); while( (res = sqlite3_step(statement)) == SQLITE_ROW) { local_freq = (float)sqlite3_column_double(statement, 2); if ( sqlite3_bind_text( stmt_select, 1, sqlite3_column_text(statement, 0), -1, SQLITE_STATIC) != SQLITE_OK ) { printf("compute_sip() : Error binding 1: %s\n", sqlite3_errmsg(*conn)); close_sqlitedb( &conn ); exit(1); } sqlite3_step( stmt_select ); global_freq = (float)sqlite3_column_double(stmt_select, 2); if ( sqlite3_bind_double( stmt_update, 1, (double)local_freq) != SQLITE_OK ) { printf("compute_sip() : Error binding 2: %s\n", sqlite3_errmsg(*conn)); close_sqlitedb( &conn ); exit(1); } if ( sqlite3_bind_double( stmt_update, 2, (double)global_freq) != SQLITE_OK ) { printf("compute_sip() : Error binding 3: %s\n", sqlite3_errmsg(*conn)); close_sqlitedb( &conn ); exit(1); } if ( sqlite3_bind_text( stmt_update, 3, sqlite3_column_text(statement, 0), -1, SQLITE_STATIC) != SQLITE_OK ) { printf("compute_sip() : Error binding 4: %s\n", sqlite3_errmsg(*conn)); close_sqlitedb( &conn ); exit(1); } sqlite3_step( stmt_update ); sqlite3_reset( stmt_select ); sqlite3_reset( stmt_update ); } sqlite3_exec(*conn, "commit", NULL, NULL, NULL ); sqlite3_finalize( statement ); sqlite3_finalize( stmt_select ); sqlite3_finalize( stmt_update ); } void make_sipword( char *team_list, char *database_name) { FILE *fp_wiki, *fp; float nbr_of_words = 0; char word[MAXBUFFER], digest_name[MAXBUFFER-LEN_CMD], team_name[MAXBUFFER-LEN_CMD]; char sqlite_query[MAXBUFFER]; int size_name; sqlite3_stmt *stmt_select, *stmt_insert, *stmt_update; sqlite3 *conn; if ( sqlite3_open( database_name, &conn ) != SQLITE_OK ) { printf("Error opening the db: %s\n", sqlite3_errmsg(conn)); exit(1); } fp = fopen(team_list, 
"r"); if ( fp == 0 ) { printf("Can't open %s file\n", team_list); exit(1); } while( !feof(fp) ) { fgets( team_name, (MAXBUFFER - LEN_CMD), fp ); if ( feof(fp) ) { break; } size_name = strlen( team_name ); team_name[size_name - 1] = '\0'; chgchar( team_name, '-', '_', size_name); printf("On the %s Team\n", team_name); // pre-compile the query to optimize the process snprintf( sqlite_query, MAXBUFFER, "SELECT * FROM %s WHERE word=?", team_name); if ( sqlite3_prepare_v2(conn, sqlite_query , -1, &stmt_select, NULL ) != SQLITE_OK ) { printf("Error compiling the request: %s\n", sqlite3_errmsg(conn) ); goto CLOSE_DB; exit(1); } snprintf( sqlite_query, MAXBUFFER, "INSERT INTO %s \ VALUES (?, 1, NULL, NULL)", team_name); if ( sqlite3_prepare_v2(conn, sqlite_query, -1, &stmt_insert, NULL ) != SQLITE_OK ) { printf("Error compiling the request: %s\n", sqlite3_errmsg(conn) ); goto CLOSE_DB; exit(1); } snprintf( sqlite_query, MAXBUFFER, "UPDATE %s \ SET global_occ = global_occ + 1 WHERE word=?", team_name); if ( sqlite3_prepare_v2(conn, sqlite_query, -1, &stmt_update, NULL ) != SQLITE_OK ) { printf("Error compiling the request: %s\n", sqlite3_errmsg(conn) ); goto CLOSE_DB; exit(1); } if ( chdir(team_name) == -1 ) { printf("error changing directory to %s\n", team_name); exit(1); } strncpy( digest_name, team_name, MAXBUFFER-LEN_CMD-4 ); strcat( digest_name, ".xtr"); fp_wiki = fopen( digest_name, "r" ); if( fp_wiki == 0 ) { printf("error when you open %s file", digest_name); exit(1); } if ( chdir("..") == -1 ) { printf("error changing directory to ..\n"); exit(1); } sqlite3_exec(conn, "begin", NULL, NULL, NULL ); while ( !feof(fp_wiki) ) { read_next_word( word, fp_wiki ); if ( is_word_valid( word ) == FALSE ) { continue; } if ( is_word_exists( word, &stmt_select, &conn ) == FALSE ) { add_to_the_list( word, &stmt_insert, &conn ); } else { inc_occ( word, &stmt_update, &conn ); } nbr_of_words++; } sqlite3_exec(conn, "commit", NULL, NULL, NULL ); fclose(fp_wiki); sqlite3_finalize( 
stmt_select ); sqlite3_finalize( stmt_insert ); sqlite3_finalize( stmt_update ); compute_freq( &conn, nbr_of_words, team_name ); compute_sip( &conn, team_name); } CLOSE_DB : if ( sqlite3_close(conn) != SQLITE_OK ) { printf("Error closing the db: %s\n", sqlite3_errmsg(conn)); } } int main( int argc, char *argv[] ) { if ( argv[TEAM_NAME] == 0 || argv[DATABASE_NAME] == 0 ) { printf("usage : %s [list of team name] [name of database] [year]\n \ - don't put [year] if you want to skip the download step.\n", argv[0] ); exit(1); } if ( argv[YEAR] == 0 ) { printf("skip the downlad step !\n"); } else { printf("start to download wiki !\n"); download_wiki( argv[TEAM_NAME], argv[YEAR] ); } printf("make the database !\n"); make_database( argv[TEAM_NAME], argv[DATABASE_NAME] ); printf("start to make a dictionary !\n"); make_dictionary( argv[TEAM_NAME], argv[DATABASE_NAME] ); printf("start calculate SIP words !\n"); make_sipword( argv[TEAM_NAME], argv[DATABASE_NAME] ); return 0; }
TOP-SIP.C
/* TOP_SIP.C *** Give the top hight SIP words PUBLIC DOMAIN From iGEM team 2010 Paris comment: This code allows you to get the words with the hight SIP value Use SIP.C before using this tools. For the random module, you can use what you want, we use MersenneTwister written in C++, that's why you have to compile with g++ in this case. MersenneTwister : http://www-personal.umich.edu/~wagnerr/MersenneTwister.html build: $ g++ -o top-sip top-sip.c -lsqlite3 usage : $ ./top-sip [list of team name] [name of database] [coeff] */ #include <stdio.h> #include <stdlib.h> #include <string.h> #include <dirent.h> #include <sqlite3.h> #include "MersenneTwister.hpp" #define MAXBUFFER 256 #define LEN_CMD 96 #define TEAM_NAME 1 #define DATABASE_NAME 2 #define DEC_COEFF 3 #define FALSE 0 #define TRUE 1 #define MIN_COEFF 0 #define MAX_COEFF 15 #define WORD_C 0 #define FREQ_C 2 #define SIP_C 3 // arbitrary multiplicator #define MUL 100000 //number of words you want #define TOP 25 /* Change a character by an other one in a string. @string : your string @c: char to remove @r : char to set instead @len : lenght of the string */ void chgchar( char *string, char c, char r, int len ) { int i = 0; while ( i < len ) { if ( string[i] == c ) { string[i] = r; i++; } else i++; } } /* pick a random number between 0 and X, use your favorite random algorithm ! I use Mersenne Twister. @prob : your X, so you have 1/X prob to get the good number. X+1 numbers can be choosen. 
return : the random number */ int pickrandom( int prob ) { MTRand mtrand1; int d = mtrand1.randInt( prob ); return d; } void compute_top_sip( char *team_list, char *database_name, float dec_coeff ) { FILE *fp, *fp_team; sqlite3_stmt *statement; char sqlite_query[MAXBUFFER]; char top_words[TOP][MAXBUFFER]; char team_name[MAXBUFFER-LEN_CMD]; int wanted_stuff, total_stuff, prob, i; unsigned long int t; int size_name, index; double top_freq[TOP]; double max_sip, curr_sip; sqlite3 *conn; if ( sqlite3_open( database_name, &conn ) != SQLITE_OK ) { printf("Error opening the db: %s\n", sqlite3_errmsg(conn)); exit(1); } fp = fopen(team_list, "r"); if ( fp == 0 ) { printf("Can't open %s file\n", team_list); exit(1); } // for each team while( !feof(fp) ) { fgets( team_name, (MAXBUFFER - LEN_CMD), fp ); if ( feof(fp) ) { break; } size_name = strlen( team_name ); team_name[size_name - 1] = '\0'; chgchar( team_name, '-', '_', size_name); printf("On the %s Team\n", team_name); max_sip = 0; snprintf( sqlite_query, MAXBUFFER, "SELECT * FROM %s", team_name); if ( sqlite3_prepare_v2(conn, sqlite_query , -1, &statement, NULL ) != SQLITE_OK ) { printf("COMPUTE HIGHT SIP : Error compiling the request: %s\n", sqlite3_errmsg(conn) ); goto CLOSE_DB; exit(1); } t = 0; while ( sqlite3_step( statement ) != SQLITE_DONE ) { curr_sip = (double)sqlite3_column_double(statement, SIP_C); if ( curr_sip > max_sip ) { max_sip = curr_sip; } t++; } printf("hight sip is %.15f, for %ld entry\n", max_sip, t); sqlite3_finalize( statement ); PICK_TOP: index = 0; prob = 0; wanted_stuff = TOP; total_stuff = 0; snprintf( sqlite_query, MAXBUFFER, "SELECT * FROM %s WHERE sip > %.15f", team_name, max_sip); if ( sqlite3_prepare_v2(conn, sqlite_query , -1, &statement, NULL ) != SQLITE_OK ) { printf("PICK TOP FIVE : Error compiling the request: %s\n", sqlite3_errmsg(conn) ); goto CLOSE_DB; exit(1); } while ( sqlite3_step( statement ) != SQLITE_DONE ) { total_stuff++; } if( total_stuff < TOP ) { sqlite3_finalize( 
statement ); max_sip = max_sip - dec_coeff; if ( max_sip <= 0 ) { continue; } goto PICK_TOP; } prob = (int)total_stuff / wanted_stuff; // we put the fraction to 1/X and we save X. printf("prob is : %f, (%d/%d), give a number between 0 and %d\n", (float)wanted_stuff / total_stuff, TOP, total_stuff, prob); prob--; while ( sqlite3_step( statement ) != SQLITE_DONE ) { if ( pickrandom(prob) == FALSE ) { strncpy( top_words[index], (const char*)sqlite3_column_text(statement, WORD_C), MAXBUFFER); top_freq[index] = (double)sqlite3_column_double(statement, FREQ_C); index++; if ( index == TOP ) { break; } } } printf("Your top-words for this team is :\n"); fp_team = fopen(team_name, "w"); for( index=0;index<TOP;index++) { printf("\t- %s (freq = %f; len=%d)\n", top_words[index], (float)(top_freq[index] * MUL), (int)(top_freq[index] * MUL)); fprintf(fp_team, "%s: %d\n", top_words[index], occ); } fclose( fp_team ); sqlite3_finalize( statement ); } fclose(fp); CLOSE_DB : if ( sqlite3_close(conn) != SQLITE_OK ) { printf("Error closing the db: %s\n", sqlite3_errmsg(conn)); } } int main( int argc, char *argv[] ) { if ( argv[TEAM_NAME] == 0 || argv[DATABASE_NAME] == 0 || argv[DEC_COEFF] == 0 ) { printf("usage : %s [list of team name] [name of database] [dec-coeff]\n", argv[0] ); printf("/tUse 0 to 15 to define a dec-coeff.\n"); exit(1); } if ( atoi( argv[DEC_COEFF]) >= MIN_COEFF && atoi( argv[DEC_COEFF]) <= MAX_COEFF ) { printf("Compute with 10^-%s dec_coeff!\n", argv[DEC_COEFF]); compute_top_sip( argv[TEAM_NAME], argv[DATABASE_NAME], powf( 10, -atoi(argv[DEC_COEFF])) ); } else { printf("Use 0 to 15 to define a dec-coeff.\n"); } return 0; }