|
|
Line 7: |
Line 7: |
| <img src="https://static.igem.org/mediawiki/2010/4/4c/SIP.png" width="75" height="75" title="SIP"> | | <img src="https://static.igem.org/mediawiki/2010/4/4c/SIP.png" width="75" height="75" title="SIP"> |
| </a> | | </a> |
- | <font size=4>SIP Wiki Analyser : Find iGEM winner with satistics</font> | + | <font size=4>SIP Wiki Analyser </font> |
| <a href="https://2010.igem.org/Team:Paris_Liliane_Bettencourt/Project/Population_counter"> | | <a href="https://2010.igem.org/Team:Paris_Liliane_Bettencourt/Project/Population_counter"> |
| <img src="https://static.igem.org/mediawiki/2010/3/30/Popcount.png" width="75" height="75" align=right title="Population Counter"> | | <img src="https://static.igem.org/mediawiki/2010/3/30/Popcount.png" width="75" height="75" align=right title="Population Counter"> |
Line 30: |
Line 30: |
| </div> | | </div> |
| | | |
- | <br /><br /> | + | <br /><br /><br /> |
| </p> | | </p> |
| </html> | | </html> |
| + | |
| + | == Find the iGEM winner with statistics == |
| | | |
| <p style="display:block"> | | <p style="display:block"> |
| <br /> | | <br /> |
| + | |
| So this year, the iGEM Paris team is trying to find out '''who will win the iGEM competition'''. We have several approaches to do that, using data from the wikis.<br /> | | So this year, the iGEM Paris team is trying to find out '''who will win the iGEM competition'''. We have several approaches to do that, using data from the wikis.<br /> |
| To analyse the wikis, we have implemented an algorithm called '''SIP''' ('''Statistically Improbable Phrases'''), which the Amazon website uses to characterize its books. We look for the words that are the most improbable in a large sample (we use all the iGEM wikis) but the most probable in the given context. | | To analyse the wikis, we have implemented an algorithm called '''SIP''' ('''Statistically Improbable Phrases'''), which the Amazon website uses to characterize its books. We look for the words that are the most improbable in a large sample (we use all the iGEM wikis) but the most probable in the given context. |
Line 58: |
Line 61: |
| </p> | | </p> |
| | | |
- | == SIP.C ==
| |
- | <p style="display:block">
| |
- | <pre>
| |
- | /* SIP.C *** Make SIP database and dictionary
| |
- |
| |
- | PUBLIC DOMAIN
| |
- | From iGEM team 2010 Paris
| |
- |
| |
- | comment:
| |
- |
| |
- | This code permit you to calculate SIP words (most improbable words) in each
| |
- | wiki team.
| |
- | You need sqlite3, wget and links to run this program.
| |
- |
| |
- | build:
| |
- |
| |
- | $ gcc -o sip sip.c -lsqlite3
| |
- |
| |
- | usage :
| |
- |
| |
- | $ ./sip [list of team name] [name of database] [year]
| |
- |
| |
- |
| |
- | Sqlite3 database :
| |
- |
| |
- | Table for team.
| |
- | +--------+-----------+------------+-----------+
| |
- | | Words | local occ | local freq | SIP value |
| |
- | +--------+-----------+------------+-----------+
| |
- | | string | u_long | float | float |
| |
- | | | | | |
| |
- | +--------+-----------+------------+-----------+
| |
- |
| |
- |
| |
- | Table name : Dictionary.
| |
- | +--------+------------+-------------+
| |
- | | word | global_occ | global_freq |
| |
- | +--------+------------+-------------+
| |
- | | string | u_long | float |
| |
- | | | | |
| |
- | +--------+------------+-------------+
| |
- |
| |
- | */
| |
- |
| |
- | #include <stdio.h>
| |
- | #include <stdlib.h>
| |
- | #include <string.h>
| |
- | #include <dirent.h>
| |
- | #include <sqlite3.h>
| |
- |
| |
- | #define MAXBUFFER 256
| |
- | #define LEN_CMD 96
| |
- |
| |
- | #define TEAM_NAME 1
| |
- | #define DATABASE_NAME 2
| |
- | #define YEAR 3
| |
- |
| |
- | #define FALSE 0
| |
- | #define TRUE 1
| |
- |
| |
- | #define IS_REP 1
| |
- | #define NO_REP 2
| |
- |
| |
- | /*
| |
- | isRep say if the stream is a directory or a file.
| |
- | @entryname : name of the file
| |
- | return : IS_REP it's a directory, NO_REP it's a file
| |
- | */
| |
- | int isRep( char *entryname )
| |
- | {
| |
- | FILE *fp;
| |
- | fp = fopen( entryname, "rb" );
| |
- |
| |
- | if ( fp == 0 )
| |
- | return IS_REP;
| |
- | else {
| |
- | fclose( fp );
| |
- | return NO_REP; }
| |
- | }
| |
- |
| |
- | void chgchar( char *string, char c, char r, int len )
| |
- | {
| |
- | int i = 0;
| |
- |
| |
- | while ( i < len )
| |
- | {
| |
- | if ( string[i] == c )
| |
- | {
| |
- |
| |
- | string[i] = r;
| |
- | i++;
| |
- | }
| |
- | else
| |
- | i++;
| |
- | }
| |
- | }
| |
- |
| |
- | int close_sqlitedb( sqlite3*** conn )
| |
- | {
| |
- | if ( sqlite3_close(**conn) != SQLITE_OK )
| |
- | {
| |
- | printf("Error closing the db: %s\n", sqlite3_errmsg(*conn));
| |
- | }
| |
- | }
| |
- |
| |
- | void read_next_word( char word[MAXBUFFER], FILE *fp )
| |
- | {
| |
- | char buffer[MAXBUFFER];
| |
- | char ch;
| |
- | int n = 0;
| |
- |
| |
- | ch = fgetc(fp);
| |
- |
| |
- | if( ch == EOF ) {
| |
- | return;
| |
- | }
| |
- |
| |
- | while ( ch != 0x20 && ch != EOF && n < MAXBUFFER )
| |
- | {
| |
- | buffer[n] = ch;
| |
- | n++;
| |
- | ch = fgetc(fp);
| |
- | }
| |
- | buffer[n] = '\0';
| |
- | strcpy( word, buffer );
| |
- | }
| |
- |
| |
- | int is_word_valid( char word )
| |
- | {
| |
- | // there's no filter at the moment.
| |
- | // We can compare with MeSH white list ie.
| |
- | }
| |
- |
| |
- | int is_word_exists( char *word, sqlite3_stmt **statement, sqlite3** conn )
| |
- | {
| |
- | if ( sqlite3_bind_text( *statement, 1, word, -1, SQLITE_STATIC) != SQLITE_OK ) {
| |
- | printf("Error binding to query select: %s\n", sqlite3_errmsg(*conn));
| |
- | close_sqlitedb( &conn );
| |
- | exit(1);
| |
- | }
| |
- |
| |
- | if ( sqlite3_step( *statement ) != SQLITE_DONE ) {
| |
- | sqlite3_reset( *statement );
| |
- | return TRUE;
| |
- | } else {
| |
- | sqlite3_reset( *statement );
| |
- | return FALSE;
| |
- | }
| |
- |
| |
- | }
| |
- |
| |
- | void add_to_the_list( char *word, sqlite3_stmt **statement, sqlite3** conn )
| |
- | {
| |
- | if ( sqlite3_bind_text( *statement, 1, word, -1, SQLITE_STATIC) != SQLITE_OK ) {
| |
- | printf("Error binding to query insert: %s\n", sqlite3_errmsg(*conn));
| |
- | close_sqlitedb( &conn );
| |
- | exit(1);
| |
- | }
| |
- |
| |
- | sqlite3_step( *statement );
| |
- | sqlite3_reset( *statement );
| |
- | }
| |
- |
| |
- | void inc_occ( char *word, sqlite3_stmt **statement, sqlite3** conn )
| |
- | {
| |
- | // UPDATE dictionary SET global_occ = global_occ + 1 WHERE word=word
| |
- | // INSERT into dictionary ...
| |
- | if ( sqlite3_bind_text( *statement, 1, word, -1, SQLITE_STATIC) != SQLITE_OK ) {
| |
- | printf("Error binding to query update: %s\n", sqlite3_errmsg(*conn));
| |
- | close_sqlitedb( &conn );
| |
- | exit(1);
| |
- | }
| |
- |
| |
- | sqlite3_step( *statement );
| |
- | sqlite3_reset( *statement );
| |
- | }
| |
- |
| |
- | void compute_freq( sqlite3** conn, float nbr_of_words, char *table )
| |
- | {
| |
- | // UPDATE dictionary SET global_freq = global_occ / nbr_of_word
| |
- | // note that nbr_of_word must be a REAL type.
| |
- | /* For each words we compute : F = occ_of_word / nbr_of_words */
| |
- | char sqlite3_query[MAXBUFFER];
| |
- | sqlite3_stmt *statement;
| |
- |
| |
- | sprintf( sqlite3_query, "UPDATE %s SET global_freq = global_occ / %f",
| |
- | table, nbr_of_words);
| |
- |
| |
- | if ( sqlite3_prepare_v2(*conn, sqlite3_query, -1, &statement, NULL ) != SQLITE_OK )
| |
- | {
| |
- | printf("Error compiling the request: %s\n", sqlite3_errmsg(*conn) );
| |
- | close_sqlitedb( &conn );
| |
- | exit(1);
| |
- | }
| |
- | sqlite3_step( statement );
| |
- | sqlite3_finalize( statement );
| |
- | }
| |
- |
| |
- | make_dictionary( char *team_list, char *database_name)
| |
- | {
| |
- | FILE *fp, *fp_wiki;
| |
- | float nbr_of_words = 0;
| |
- | char word[MAXBUFFER], digest_name[MAXBUFFER-LEN_CMD], team_name[MAXBUFFER-LEN_CMD];
| |
- | int size_name;
| |
- |
| |
- | sqlite3_stmt *stmt_select, *stmt_insert, *stmt_update;
| |
- |
| |
- | sqlite3 *conn;
| |
- | if ( sqlite3_open( database_name, &conn ) != SQLITE_OK )
| |
- | {
| |
- | printf("Error opening the db: %s\n", sqlite3_errmsg(conn));
| |
- | exit(1);
| |
- | }
| |
- |
| |
- | fp = fopen(team_list, "r");
| |
- |
| |
- | if ( fp == 0 ) {
| |
- | printf("Can't open %s file\n", team_list);
| |
- | exit(1);
| |
- | }
| |
- |
| |
- | // pre-compile the query to optimize the process
| |
- | if ( sqlite3_prepare_v2(conn, "SELECT * FROM dictionary WHERE word=?", -1,
| |
- | &stmt_select, NULL ) != SQLITE_OK )
| |
- | {
| |
- | printf("Error compiling the request: %s\n", sqlite3_errmsg(conn) );
| |
- | goto CLOSE_DB;
| |
- | exit(1);
| |
- | }
| |
- |
| |
- | if ( sqlite3_prepare_v2(conn, "INSERT INTO dictionary VALUES (?, 1, NULL)", -1,
| |
- | &stmt_insert, NULL ) != SQLITE_OK )
| |
- | {
| |
- | printf("Error compiling the request: %s\n", sqlite3_errmsg(conn) );
| |
- | goto CLOSE_DB;
| |
- | exit(1);
| |
- | }
| |
- |
| |
- | if ( sqlite3_prepare_v2(conn, "UPDATE dictionary \
| |
- | SET global_occ = global_occ + 1 WHERE word=?",
| |
- | -1, &stmt_update, NULL ) != SQLITE_OK )
| |
- | {
| |
- | printf("Error compiling the request: %s\n", sqlite3_errmsg(conn) );
| |
- | goto CLOSE_DB;
| |
- | exit(1);
| |
- | }
| |
- |
| |
- | while( !feof(fp) )
| |
- | {
| |
- | fgets( team_name, (MAXBUFFER - LEN_CMD), fp );
| |
- |
| |
- | if ( feof(fp) ) {
| |
- | break;
| |
- | }
| |
- |
| |
- | size_name = strlen( team_name );
| |
- | team_name[size_name - 1] = '\0';
| |
- | chgchar( team_name, '-', '_', size_name);
| |
- | printf("On the %s Team\n", team_name);
| |
- |
| |
- | if ( chdir(team_name) == -1 ) {
| |
- | printf("error changing directory to %s\n", team_name);
| |
- | exit(1);
| |
- | }
| |
- |
| |
- | strncpy( digest_name, team_name, (MAXBUFFER-LEN_CMD-4) );
| |
- | strcat( digest_name, ".xtr");
| |
- |
| |
- | fp_wiki = fopen( digest_name, "r" );
| |
- | if( fp_wiki == 0 ) {
| |
- | printf("error when you open %s file", digest_name);
| |
- | exit(1);
| |
- | }
| |
- |
| |
- | if ( chdir("..") == -1 ) {
| |
- | printf("error changing directory to ..\n");
| |
- | exit(1);
| |
- | }
| |
- |
| |
- |
| |
- | sqlite3_exec(conn, "begin", NULL, NULL, NULL );
| |
- | while ( !feof(fp_wiki) )
| |
- | {
| |
- | read_next_word( word, fp_wiki );
| |
- | if ( is_word_valid( word ) == FALSE ) {
| |
- | continue;
| |
- | }
| |
- | if ( is_word_exists( word, &stmt_select, &conn ) == FALSE ) {
| |
- | add_to_the_list( word, &stmt_insert, &conn );
| |
- | }
| |
- | else {
| |
- | inc_occ( word, &stmt_update, &conn );
| |
- | }
| |
- |
| |
- | nbr_of_words++;
| |
- | }
| |
- | sqlite3_exec(conn, "commit", NULL, NULL, NULL );
| |
- | fclose(fp_wiki);
| |
- | }
| |
- | fclose(fp);
| |
- | sqlite3_finalize( stmt_select );
| |
- | sqlite3_finalize( stmt_insert );
| |
- | sqlite3_finalize( stmt_update );
| |
- |
| |
- | compute_freq( &conn, nbr_of_words, "dictionary" );
| |
- |
| |
- | CLOSE_DB :
| |
- | if ( sqlite3_close(conn) != SQLITE_OK ) {
| |
- | printf("Error closing the db: %s\n", sqlite3_errmsg(conn));
| |
- | }
| |
- | }
| |
- |
| |
- | void make_database( char *team_list, char *database_name )
| |
- | {
| |
- | sqlite3 *conn;
| |
- | sqlite3_stmt *statement;
| |
- | FILE* fp;
| |
- | int size_name;
| |
- | char team_name[MAXBUFFER-LEN_CMD], sqlite_query[MAXBUFFER];
| |
- |
| |
- | if ( sqlite3_open( database_name, &conn ) != SQLITE_OK )
| |
- | {
| |
- | printf("Error opening the db: %s\n", sqlite3_errmsg(conn));
| |
- | exit(1);
| |
- | }
| |
- |
| |
- | if ( sqlite3_prepare_v2(conn, "CREATE TABLE dictionary ( word text, \
| |
- | global_occ int, global_freq real )", -1, &statement, NULL ) != SQLITE_OK )
| |
- | {
| |
- | printf("Error compiling the request: %s\n", sqlite3_errmsg(conn) );
| |
- | goto CLOSE_DB;
| |
- | exit(1);
| |
- | }
| |
- |
| |
- | sqlite3_step( statement );
| |
- | sqlite3_finalize( statement );
| |
- |
| |
- | fp = fopen(team_list, "r");
| |
- |
| |
- | if ( fp == 0 ) {
| |
- | printf("Can't open %s file\n", team_list);
| |
- | exit(1);
| |
- | }
| |
- |
| |
- | while( !feof(fp) )
| |
- | {
| |
- | fgets( team_name, (MAXBUFFER - LEN_CMD), fp );
| |
- |
| |
- | if ( feof(fp) ) {
| |
- | break;
| |
- | }
| |
- |
| |
- | size_name = strlen( team_name );
| |
- | team_name[size_name - 1] = '\0';
| |
- | chgchar( team_name, '-', '_', size_name);
| |
- | printf("Create %s Team Table\n", team_name);
| |
- |
| |
- | sprintf( sqlite_query, "CREATE TABLE %s ( word text, global_occ int, \
| |
- | global_freq real, sip real )", team_name);
| |
- | if ( sqlite3_prepare_v2(conn, sqlite_query, -1, &statement, NULL ) != SQLITE_OK )
| |
- | {
| |
- | printf("Error compiling the request: %s\n", sqlite3_errmsg(conn) );
| |
- | goto CLOSE_DB;
| |
- | exit(1);
| |
- | }
| |
- |
| |
- | sqlite3_step( statement );
| |
- | sqlite3_reset( statement );
| |
- |
| |
- | }
| |
- | fclose( fp);
| |
- | sqlite3_finalize( statement );
| |
- |
| |
- | CLOSE_DB :
| |
- | if ( sqlite3_close(conn) != SQLITE_OK ) {
| |
- | printf("Error closing the db: %s\n", sqlite3_errmsg(conn));
| |
- | }
| |
- |
| |
- | }
| |
- |
| |
- | void download_wiki( char *team_list, char* year )
| |
- | {
| |
- | FILE *fp_src, *fp_tar, *fp;
| |
- | char ch, last_ch;
| |
- | char buffer[MAXBUFFER], team_name[MAXBUFFER-LEN_CMD], digest_name[MAXBUFFER-LEN_CMD];
| |
- | int size_name;
| |
- | DIR *dp;
| |
- | struct dirent *entry;
| |
- |
| |
- | fp = fopen(team_list, "r");
| |
- |
| |
- | if ( fp == 0 ) {
| |
- | printf("Can't open %s file\n", team_list);
| |
- | exit(1);
| |
- | }
| |
- |
| |
- | while( !feof(fp) )
| |
- | {
| |
- | fgets( team_name, (MAXBUFFER - LEN_CMD), fp );
| |
- |
| |
- | if ( feof(fp) ) {
| |
- | break;
| |
- | }
| |
- |
| |
- | size_name = strlen( team_name );
| |
- | team_name[size_name - 1] = '\0';
| |
- |
| |
- | chgchar( team_name, '-', '_', size_name);
| |
- |
| |
- | if ( mkdir(team_name, 0777) == -1 ) {
| |
- | printf("error in the mkdir\n");
| |
- | exit(1);
| |
- | }
| |
- |
| |
- | if ( chdir(team_name) == -1 ) {
| |
- | printf("error changing the directory to %s\n", team_name);
| |
- | exit(1);
| |
- | }
| |
- |
| |
- | if ( strcmp( year, "2007") == 0 ) {
| |
- | snprintf(buffer, MAXBUFFER, "wget -R.jpg -R.png -R.gif \
| |
- | -R.jpeg -RUser:* -E -l1 -nd -r \
| |
- | http://parts.mit.edu/igem07/index.php/%s", team_name);
| |
- | printf("CMD : %s\n", buffer);
| |
- | system(buffer);
| |
- | } else {
| |
- | snprintf(buffer, MAXBUFFER, "wget -E -ITeam:%s -nd -r \
| |
- | http://%s.igem.org/Team:%s", team_name, year, team_name);
| |
- | printf("CMD : %s\n", buffer);
| |
- | system(buffer);
| |
- | }
| |
- |
| |
- | if((dp = opendir(".")) == NULL) {
| |
- | printf("can't open the dir");
| |
- | return;
| |
- | }
| |
- |
| |
- | while((entry = readdir(dp)) != NULL) {
| |
- | if ( isRep(entry->d_name) != IS_REP ) {
| |
- | if ( (strstr(entry->d_name, ".html") != NULL)
| |
- | && (strstr(entry->d_name, ".dumpx") == NULL) ) {
| |
- | snprintf(buffer, MAXBUFFER, "links -dump %s > %s.dumpx",
| |
- | entry->d_name, entry->d_name);
| |
- | system(buffer);
| |
- | }
| |
- | }
| |
- | }
| |
- | closedir(dp);
| |
- | snprintf(buffer, MAXBUFFER, "cat *.dumpx > %s", team_name);
| |
- | system(buffer);
| |
- |
| |
- | strncpy( digest_name, team_name, (MAXBUFFER-LEN_CMD-4) );
| |
- | strcat( digest_name, ".xtr");
| |
- |
| |
- | fp_src = fopen(team_name, "r");
| |
- | if ( fp == 0 ) {
| |
- | printf("Can't open %s file\n", team_name);
| |
- | exit(1);
| |
- | }
| |
- |
| |
- | fp_tar = fopen(digest_name, "w");
| |
- | if ( fp == 0 ) {
| |
- | printf("Can't open %s file\n", digest_name);
| |
- | exit(1);
| |
- | }
| |
- |
| |
- | while( !feof(fp_src) )
| |
- | {
| |
- | ch = fgetc(fp_src);
| |
- |
| |
- | if ( (ch >= 0x41 && ch <= 0x5A) ||
| |
- | (ch >= 0x61 && ch <= 0x7A) ||
| |
- | (ch == 0x20 && last_ch != 0x20) ||
| |
- | ch == 0x2D ||
| |
- | (ch >= 0x30 && ch <= 0x39) ) {
| |
- | if (ch >= 0x41 && ch <= 0x5A) { //put in lowercase
| |
- | ch += 0x20;
| |
- | }
| |
- | fputc(ch, fp_tar);
| |
- | }
| |
- | else {
| |
- | ch = 0x20;
| |
- |
| |
- | if ( last_ch != 0x20 ) {
| |
- | fputc(ch, fp_tar);
| |
- | }
| |
- | }
| |
- | last_ch = ch;
| |
- | }
| |
- |
| |
- | fclose( fp_src );
| |
- | fclose( fp_tar );
| |
- |
| |
- | if ( chdir("..") == -1 ) {
| |
- | printf("error changing the directory to %s\n", team_name);
| |
- | exit(1);
| |
- | }
| |
- | }
| |
- | fclose(fp);
| |
- | }
| |
- |
| |
- | void compute_sip( sqlite3 **conn, char *team_name )
| |
- | {
| |
- | // SELECT * from TEAM_NAME;
| |
- | // pour chaque result -> get the freq | get the word => get the freq of word in all wiki
| |
- | // f/F
| |
- | // UPDATE where word='yourword'
| |
- |
| |
- | sqlite3_stmt *statement, *stmt_select, *stmt_update;
| |
- | char sqlite_query[MAXBUFFER];
| |
- | float local_freq, global_freq;
| |
- | int res;
| |
- |
| |
- | snprintf( sqlite_query, MAXBUFFER, "SELECT * from %s", team_name);
| |
- | if ( sqlite3_prepare_v2(*conn, sqlite_query, -1, &statement, NULL ) != SQLITE_OK ) {
| |
- | printf("compute_sip() : Error compiling the request 1: %s\n",
| |
- | sqlite3_errmsg(*conn) );
| |
- | close_sqlitedb( &conn );
| |
- | exit(1);
| |
- | }
| |
- |
| |
- | if ( sqlite3_prepare_v2(*conn, "SELECT * from dictionary WHERE word=?", -1,
| |
- | &stmt_select, NULL ) != SQLITE_OK )
| |
- | {
| |
- | printf("compute_sip() : Error compiling the request 2: %s\n",
| |
- | sqlite3_errmsg(*conn) );
| |
- | close_sqlitedb( &conn );
| |
- | exit(1);
| |
- | }
| |
- |
| |
- | snprintf( sqlite_query, MAXBUFFER, "UPDATE %s SET sip = ?/? WHERE word=?",
| |
- | team_name);
| |
- | if ( sqlite3_prepare_v2(*conn, sqlite_query, -1, &stmt_update, NULL ) != SQLITE_OK ) {
| |
- | printf("compute_sip() : Error compiling the request 3: %s\n",
| |
- | sqlite3_errmsg(*conn) );
| |
- | close_sqlitedb( &conn );
| |
- | exit(1);
| |
- | }
| |
- |
| |
- | sqlite3_exec(*conn, "begin", NULL, NULL, NULL );
| |
- | while( (res = sqlite3_step(statement)) == SQLITE_ROW)
| |
- | {
| |
- | local_freq = (float)sqlite3_column_double(statement, 2);
| |
- |
| |
- | if ( sqlite3_bind_text( stmt_select, 1, sqlite3_column_text(statement, 0), -1,
| |
- | SQLITE_STATIC) != SQLITE_OK ) {
| |
- | printf("compute_sip() : Error binding 1: %s\n", sqlite3_errmsg(*conn));
| |
- | close_sqlitedb( &conn );
| |
- | exit(1);
| |
- | }
| |
- | sqlite3_step( stmt_select );
| |
- | global_freq = (float)sqlite3_column_double(stmt_select, 2);
| |
- |
| |
- | if ( sqlite3_bind_double( stmt_update, 1, (double)local_freq) != SQLITE_OK ) {
| |
- | printf("compute_sip() : Error binding 2: %s\n", sqlite3_errmsg(*conn));
| |
- | close_sqlitedb( &conn );
| |
- | exit(1);
| |
- | }
| |
- |
| |
- | if ( sqlite3_bind_double( stmt_update, 2, (double)global_freq) != SQLITE_OK ) {
| |
- | printf("compute_sip() : Error binding 3: %s\n", sqlite3_errmsg(*conn));
| |
- | close_sqlitedb( &conn );
| |
- | exit(1);
| |
- | }
| |
- |
| |
- | if ( sqlite3_bind_text( stmt_update, 3, sqlite3_column_text(statement, 0), -1,
| |
- | SQLITE_STATIC) != SQLITE_OK ) {
| |
- | printf("compute_sip() : Error binding 4: %s\n", sqlite3_errmsg(*conn));
| |
- | close_sqlitedb( &conn );
| |
- | exit(1);
| |
- | }
| |
- |
| |
- | sqlite3_step( stmt_update );
| |
- |
| |
- | sqlite3_reset( stmt_select );
| |
- | sqlite3_reset( stmt_update );
| |
- | }
| |
- | sqlite3_exec(*conn, "commit", NULL, NULL, NULL );
| |
- |
| |
- | sqlite3_finalize( statement );
| |
- | sqlite3_finalize( stmt_select );
| |
- | sqlite3_finalize( stmt_update );
| |
- | }
| |
- |
| |
- | void make_sipword( char *team_list, char *database_name)
| |
- | {
| |
- | FILE *fp_wiki, *fp;
| |
- | float nbr_of_words = 0;
| |
- | char word[MAXBUFFER], digest_name[MAXBUFFER-LEN_CMD], team_name[MAXBUFFER-LEN_CMD];
| |
- | char sqlite_query[MAXBUFFER];
| |
- | int size_name;
| |
- |
| |
- | sqlite3_stmt *stmt_select, *stmt_insert, *stmt_update;
| |
- |
| |
- | sqlite3 *conn;
| |
- | if ( sqlite3_open( database_name, &conn ) != SQLITE_OK )
| |
- | {
| |
- | printf("Error opening the db: %s\n", sqlite3_errmsg(conn));
| |
- | exit(1);
| |
- | }
| |
- |
| |
- | fp = fopen(team_list, "r");
| |
- |
| |
- | if ( fp == 0 ) {
| |
- | printf("Can't open %s file\n", team_list);
| |
- | exit(1);
| |
- | }
| |
- |
| |
- | while( !feof(fp) )
| |
- | {
| |
- | fgets( team_name, (MAXBUFFER - LEN_CMD), fp );
| |
- |
| |
- | if ( feof(fp) ) {
| |
- | break;
| |
- | }
| |
- |
| |
- | size_name = strlen( team_name );
| |
- | team_name[size_name - 1] = '\0';
| |
- | chgchar( team_name, '-', '_', size_name);
| |
- |
| |
- | printf("On the %s Team\n", team_name);
| |
- |
| |
- | // pre-compile the query to optimize the process
| |
- | snprintf( sqlite_query, MAXBUFFER, "SELECT * FROM %s WHERE word=?",
| |
- | team_name);
| |
- |
| |
- | if ( sqlite3_prepare_v2(conn, sqlite_query , -1,
| |
- | &stmt_select, NULL ) != SQLITE_OK )
| |
- | {
| |
- | printf("Error compiling the request: %s\n", sqlite3_errmsg(conn) );
| |
- | goto CLOSE_DB;
| |
- | exit(1);
| |
- | }
| |
- |
| |
- | snprintf( sqlite_query, MAXBUFFER, "INSERT INTO %s \
| |
- | VALUES (?, 1, NULL, NULL)", team_name);
| |
- |
| |
- | if ( sqlite3_prepare_v2(conn, sqlite_query, -1,
| |
- | &stmt_insert, NULL ) != SQLITE_OK )
| |
- | {
| |
- | printf("Error compiling the request: %s\n", sqlite3_errmsg(conn) );
| |
- | goto CLOSE_DB;
| |
- | exit(1);
| |
- | }
| |
- |
| |
- | snprintf( sqlite_query, MAXBUFFER, "UPDATE %s \
| |
- | SET global_occ = global_occ + 1 WHERE word=?", team_name);
| |
- |
| |
- | if ( sqlite3_prepare_v2(conn, sqlite_query, -1,
| |
- | &stmt_update, NULL ) != SQLITE_OK )
| |
- | {
| |
- | printf("Error compiling the request: %s\n", sqlite3_errmsg(conn) );
| |
- | goto CLOSE_DB;
| |
- | exit(1);
| |
- | }
| |
- |
| |
- | if ( chdir(team_name) == -1 ) {
| |
- | printf("error changing directory to %s\n", team_name);
| |
- | exit(1);
| |
- | }
| |
- |
| |
- | strncpy( digest_name, team_name, MAXBUFFER-LEN_CMD-4 );
| |
- | strcat( digest_name, ".xtr");
| |
- |
| |
- | fp_wiki = fopen( digest_name, "r" );
| |
- | if( fp_wiki == 0 ) {
| |
- | printf("error when you open %s file", digest_name);
| |
- | exit(1);
| |
- | }
| |
- |
| |
- | if ( chdir("..") == -1 ) {
| |
- | printf("error changing directory to ..\n");
| |
- | exit(1);
| |
- | }
| |
- |
| |
- |
| |
- | sqlite3_exec(conn, "begin", NULL, NULL, NULL );
| |
- | while ( !feof(fp_wiki) )
| |
- | {
| |
- | read_next_word( word, fp_wiki );
| |
- | if ( is_word_valid( word ) == FALSE ) {
| |
- | continue;
| |
- | }
| |
- | if ( is_word_exists( word, &stmt_select, &conn ) == FALSE ) {
| |
- | add_to_the_list( word, &stmt_insert, &conn );
| |
- | }
| |
- | else {
| |
- | inc_occ( word, &stmt_update, &conn );
| |
- | }
| |
- |
| |
- | nbr_of_words++;
| |
- | }
| |
- | sqlite3_exec(conn, "commit", NULL, NULL, NULL );
| |
- | fclose(fp_wiki);
| |
- | sqlite3_finalize( stmt_select );
| |
- | sqlite3_finalize( stmt_insert );
| |
- | sqlite3_finalize( stmt_update );
| |
- |
| |
- | compute_freq( &conn, nbr_of_words, team_name );
| |
- | compute_sip( &conn, team_name);
| |
- | }
| |
- |
| |
- | CLOSE_DB :
| |
- | if ( sqlite3_close(conn) != SQLITE_OK ) {
| |
- | printf("Error closing the db: %s\n", sqlite3_errmsg(conn));
| |
- | }
| |
- | }
| |
- |
| |
- | int main( int argc, char *argv[] )
| |
- | {
| |
- |
| |
- | if ( argv[TEAM_NAME] == 0 || argv[DATABASE_NAME] == 0 ) {
| |
- | printf("usage : %s [list of team name] [name of database] [year]\n \
| |
- | - don't put [year] if you want to skip the download step.\n", argv[0] );
| |
- | exit(1);
| |
- | }
| |
- |
| |
- | if ( argv[YEAR] == 0 ) {
| |
- | printf("skip the downlad step !\n");
| |
- | } else {
| |
- | printf("start to download wiki !\n");
| |
- | download_wiki( argv[TEAM_NAME], argv[YEAR] );
| |
- | }
| |
- |
| |
- | printf("make the database !\n");
| |
- | make_database( argv[TEAM_NAME], argv[DATABASE_NAME] );
| |
- |
| |
- | printf("start to make a dictionary !\n");
| |
- | make_dictionary( argv[TEAM_NAME], argv[DATABASE_NAME] );
| |
- |
| |
- | printf("start calculate SIP words !\n");
| |
- | make_sipword( argv[TEAM_NAME], argv[DATABASE_NAME] );
| |
- |
| |
- | return 0;
| |
- | }
| |
- |
| |
- | </pre>
| |
- |
| |
- | <br />
| |
- | <br />
| |
- | </p>
| |
- |
| |
- | == Downloads ==
| |
- |
| |
- | <p style="display:block">
| |
- | '''Team List'''
| |
- | * [https://2010.igem.org/Team:Paris_Liliane_Bettencourt/Software Team list 2009 (UNIX)] <html><a href="https://2010.igem.org/Team:Paris_Liliane_Bettencourt/Software"> |</a></html> [https://2010.igem.org/Team:Paris_Liliane_Bettencourt/Software Team list 2009 (WIN32)]
| |
- | * [https://2010.igem.org/Team:Paris_Liliane_Bettencourt/Software Team list 2008 (UNIX)] <html><a href="https://2010.igem.org/Team:Paris_Liliane_Bettencourt/Software"> |</a></html> [https://2010.igem.org/Team:Paris_Liliane_Bettencourt/Software Team list 2008 (WIN32)]
| |
- | * [https://2010.igem.org/Team:Paris_Liliane_Bettencourt/Software Team list 2007 (UNIX)] <html><a href="https://2010.igem.org/Team:Paris_Liliane_Bettencourt/Software"> |</a></html> [https://2010.igem.org/Team:Paris_Liliane_Bettencourt/Software Team list 2007 (WIN32)]
| |
- | '''Wiki Data'''
| |
- | * [https://2010.igem.org/Team:Paris_Liliane_Bettencourt/Software Wiki data 2009 (TARGZ)] <html><a href="https://2010.igem.org/Team:Paris_Liliane_Bettencourt/Software"> |</a></html> [https://2010.igem.org/Team:Paris_Liliane_Bettencourt/Software Wiki data 2009 (ZIP)]
| |
- | * [https://2010.igem.org/Team:Paris_Liliane_Bettencourt/Software Wiki data 2008 (TARGZ)] <html><a href="https://2010.igem.org/Team:Paris_Liliane_Bettencourt/Software"> |</a></html> [https://2010.igem.org/Team:Paris_Liliane_Bettencourt/Software Wiki data 2008 (ZIP)]
| |
- | * [https://2010.igem.org/Team:Paris_Liliane_Bettencourt/Software Wiki data 2007 (TARGZ)] <html><a href="https://2010.igem.org/Team:Paris_Liliane_Bettencourt/Software"> |</a></html> [https://2010.igem.org/Team:Paris_Liliane_Bettencourt/Software Wiki data 2007 (ZIP)]
| |
- | '''SIP Database'''
| |
- | * [https://2010.igem.org/Team:Paris_Liliane_Bettencourt/Software SIP words database 2009]
| |
- | * [https://2010.igem.org/Team:Paris_Liliane_Bettencourt/Software SIP words database 2008]
| |
- | * [https://2010.igem.org/Team:Paris_Liliane_Bettencourt/Software SIP words database 2007]
| |
- |
| |
- | <br />
| |
- | To read databases, use <b>[http://www.sqlite.org/ sqlite3]</b>.<br/>
| |
- | <br />
| |
- | <i><b>Warning :</b> Notice these files are generated using "links -dump" to remove html, to speed the process, but you can do without that, because SIP will remove them later. With links, some pages with special characters like '(' ')' and ':' in their name are not converted, we consider it's not very important, because it's a small number of pages, but you can re-gen the database without the html parse step.<br />
| |
- | You can also use html2text, but if the software find special character, it don't remove the html.<br />
| |
- | Also, I know there're some few bugs like in 2009's files : Illinois-tools is not downloaded... But unfortunately, I didn't fix them before the deadline.</i>
| |
- |
| |
- | <br />
| |
- | <br />
| |
- | </p>
| |
| | | |
| == References == | | == References == |