Team:Paris Liliane Bettencourt/Project/SIP/Codes

From 2010.igem.org

(Difference between revisions)
 
(7 intermediate revisions not shown)
Line 5: Line 5:
<p style="display:block">
<p style="display:block">
<a href=""https://2010.igem.org/Team:Paris_Liliane_Bettencourt/Projects/SIP">
<a href=""https://2010.igem.org/Team:Paris_Liliane_Bettencourt/Projects/SIP">
-
  <img src="https://static.igem.org/mediawiki/2010/4/4c/SIP.png" width="75" height="75" title="SIP">
+
  <img src="https://static.igem.org/mediawiki/2010/4/4c/SIP.png" width="148" height="120" title="SIP">
 +
</a>
 +
<font size=4>SIP Wiki Analyser </font>
 +
<a href="https://2010.igem.org/Team:Paris_Liliane_Bettencourt/Project/Synbioworld">
 +
<img src="https://static.igem.org/mediawiki/2010/2/25/SBW.jpg" width="129" height="107" align=right title="SynBioWorld">
</a>
</a>
-
<font size=4>SIP Wiki Analyser : Codes </font>
 
<a href="https://2010.igem.org/Team:Paris_Liliane_Bettencourt/Project/Population_counter">
<a href="https://2010.igem.org/Team:Paris_Liliane_Bettencourt/Project/Population_counter">
-
  <img src="https://static.igem.org/mediawiki/2010/9/93/Pop_counter_logo-01.jpg" width="108" height="89" align=right title="Population Counter">
+
  <img src="https://static.igem.org/mediawiki/2010/9/93/Pop_counter_logo-01.jpg" width="129" height="107" align=right title="Population Counter">
</a>
</a>
<a href="https://2010.igem.org/Team:Paris_Liliane_Bettencourt/Project/Memo-cell">
<a href="https://2010.igem.org/Team:Paris_Liliane_Bettencourt/Project/Memo-cell">
-
  <img src="https://static.igem.org/mediawiki/2010/a/aa/Memo_cell-01.jpg" width="108" height="89" align=right title="Memo-Cell">
+
  <img src="https://static.igem.org/mediawiki/2010/a/aa/Memo_cell-01.jpg" width="129" height="107" align=right title="Memo-Cell">
</a> <br />
</a> <br />
</p>
</p>
Line 47: Line 50:
<br />
<br />
-
*'''TOP_SIP.C :'''  
+
*'''TOP_SIP.C :''' After you've generated SIP values, this bit of code allows you to pluck the top N SIP's, where N is any number you like.  Specify the database and the names of teams you wish to use in a list file.   
-
 
+
-
After you've generated SIP values, this bit of code allows you to pluck the top N SIP's, where N is any number you like.  Specify the database and the names of teams you wish to use in a list file.   
+
-
The last argument, coefficient, is the value used to decrement your hight sip value, if you haven't the number of words you want with the last computing, it's a number between ''1'' and ''15'', and give a number between ''10^-1'' and ''10^-15''. Adjust this number like you want, good coeff, give you better results and speed the software, but good coeff is not the highter.
+
If the software finds more than N words above your given SIP value, it randomly picks N words.
-
About the random selection : to take a good random words, I see how many words there are, so if you want all words has the same luck to be selected, each word have a probability of : ''nbr-of-words-selected/nbr-of-words-to-be-choosed'', so the virtual dice give me a number between ''0'' and ''(nbr-of-word/nbr-of-words-selected) - 1''.
+
If it finds less, it gradually decreases the SIP value needed to enter the list until the list is composed of N words. The last argument, coefficient, is the value used to decrement your sip value level used to define your list. It's a number between ''1'' and ''15''. Adjust this number like you want to improve results and software speed.
-
<br /><br />
+
<br /><br />
</p>
</p>
Line 60: Line 61:
<p style="display:block">
<p style="display:block">
<pre>
<pre>
-
/* SIP.C *** Make SIP database and dictionary
+
/* SIP.C *** Make SIP database and dictionary *** v 1.0
PUBLIC DOMAIN
PUBLIC DOMAIN
Line 67: Line 68:
comment:
comment:
-
  This code permit you to calculate SIP words (most improbable words) in each
+
  This code permits you to calculate SIP words (most improbable words) in each
  wiki team.
  wiki team.
  You need sqlite3, wget and links to run this program.
  You need sqlite3, wget and links to run this program.
Line 121: Line 122:
/*
/*
-
isRep say if the stream is a directory or a file.
+
isRep says if the stream is a directory or a file.
@entryname : name of the file
@entryname : name of the file
return : IS_REP it's a directory, NO_REP it's a file
return : IS_REP it's a directory, NO_REP it's a file
Line 471: Line 472:
char ch, last_ch;
char ch, last_ch;
char buffer[MAXBUFFER], team_name[MAXBUFFER-LEN_CMD], digest_name[MAXBUFFER-LEN_CMD];
char buffer[MAXBUFFER], team_name[MAXBUFFER-LEN_CMD], digest_name[MAXBUFFER-LEN_CMD];
 +
char dir_name[MAXBUFFER];
int size_name;
int size_name;
DIR *dp;
DIR *dp;
Line 493: Line 495:
team_name[size_name - 1] = '\0';
team_name[size_name - 1] = '\0';
-
chgchar( team_name, '-', '_', size_name);
+
size_name = strlen( dir_name );
 +
strncpy( dir_name, team_name, MAXBUFFER);
 +
chgchar( dir_name, '-', '_', size_name);
-
 
+
if ( mkdir(dir_name, 0777) == -1 ) {
-
if ( mkdir(team_name, 0777) == -1 ) {
+
printf("error in the mkdir\n");
printf("error in the mkdir\n");
exit(1);
exit(1);
}
}
-
if ( chdir(team_name) == -1 ) {
+
if ( chdir(dir_name) == -1 ) {
printf("error changing the directory to %s\n", team_name);
printf("error changing the directory to %s\n", team_name);
exit(1);
exit(1);
}
}
-
chgchar( team_name, '_', '-', size_name);
 
-
chgchar( team_name, '_', ' ', size_name);
 
if ( strcmp( year, "2007") == 0 ) {
if ( strcmp( year, "2007") == 0 ) {
snprintf(buffer, MAXBUFFER, "wget -R.jpg -R.png -R.gif \
snprintf(buffer, MAXBUFFER, "wget -R.jpg -R.png -R.gif \
-R.jpeg -RUser:* -E -l1 -nd -r \
-R.jpeg -RUser:* -E -l1 -nd -r \
-
http://parts.mit.edu/igem07/index.php/%s", team_name);
+
https://2007.igem.org/%s", team_name);
printf("CMD : %s\n", buffer);
printf("CMD : %s\n", buffer);
system(buffer);
system(buffer);
Line 598: Line 599:
{
{
// SELECT * from TEAM_NAME;
// SELECT * from TEAM_NAME;
-
// pour chaque result -> get the freq | get the word => get the freq of word in all wiki
+
// For each result -> get the freq | get the word => get the freq of word in all wiki
// f/F
// f/F
// UPDATE where word='yourword'
// UPDATE where word='yourword'
Line 830: Line 831:
return 0;
return 0;
}
}
-
 
-
 
-
 
</pre>
</pre>
Line 843: Line 841:
<p style="display:block">
<p style="display:block">
<pre>
<pre>
-
/* TOP_SIP.C *** Give the top high SIP words
+
/* TOP_SIP.C *** Give the top high SIP words *** v 1.0
PUBLIC DOMAIN
PUBLIC DOMAIN
Line 950: Line 948:
{
{
int i = 0;
int i = 0;
-
int n=TRUE;
+
int n=FALSE;
int alph_char = 0;
int alph_char = 0;
int numb_char = 0;
int numb_char = 0;
 +
//// DNA sequences filter
 +
while ( i < len )
 +
{
 +
if ( string[i] != 'a' || string[i] != 'g' || string[i] != 't' || string[i] != 'c' ) {
 +
n = TRUE;
 +
break;
 +
}
 +
i++;
 +
}
 +
 +
/// Alpha-numeric filter
 +
i = 0;
while ( i < len )
while ( i < len )
{
{
Line 1,140: Line 1,150:
top_freq[index] = (double)sqlite3_column_double(statement, FREQ_C);
top_freq[index] = (double)sqlite3_column_double(statement, FREQ_C);
index++;
index++;
-
 
-
//printf("index = %d\n", index); //debug
 
if ( rest <= 0 ) {
if ( rest <= 0 ) {
Line 1,157: Line 1,165:
prev_rest = rest;
prev_rest = rest;
nbr_words_registred = index;
nbr_words_registred = index;
-
 
-
if ( loop == TRUE ) {
 
-
goto LOOP;
 
-
}
 
}
}
Line 1,166: Line 1,170:
fp_team = fopen(team_name, "w");
fp_team = fopen(team_name, "w");
 +
 +
if( fp_team == 0 ) {
 +
printf("error opening the %s file\n", team_name);
 +
exit(1);
 +
}
for( index=0;index<TOP;index++) {
for( index=0;index<TOP;index++) {
if ( FILTER ) {
if ( FILTER ) {
-
if ( is_word_is_valid(top_words[index], strlen(top_words[index]) ) == TRUE ) {
+
if ( is_word_is_valid(top_words[index], strlen(top_words[index]) ) == FALSE ) {
-
printf("\t- %s (freq = %f; len=%d)\n", top_words[index],
+
continue;
-
(float)(top_freq[index] * multiplicator),
+
-
(int)(top_freq[index] * multiplicator));
+
-
 
+
-
fprintf(fp_team, "%s: %d\n", top_words[index],
+
-
(int)(top_freq[index] * multiplicator));
+
}
}
}
}
 +
 +
printf("\t- %s (freq = %f; len=%d)\n", top_words[index],
 +
(float)(top_freq[index] * multiplicator),
 +
(int)(top_freq[index] * multiplicator));
 +
 +
fprintf(fp_team, "%s: %d\n", top_words[index],
 +
(int)(top_freq[index] * multiplicator));
}
}
fclose( fp_team );
fclose( fp_team );

Latest revision as of 03:05, 28 October 2010



SIP Wiki Analyser







These two pieces of code make up our SIP system. We are releasing them without any sort of license or restriction to the public domain, in the hope that other teams will find them as useful as we have! Our SIP programs are written in C and make use of the sqlite3 API. The Paris SIP programs run natively in Linux but can be run on Mac OSX or windows (with Cygwin for wget and MinGW for sqlite3.) For the random module, we have used a C++ algorithm [http://www-personal.umich.edu/~wagnerr/MersenneTwister.html MersenneTwister], so G++ is needed to compile.

  • SIP.C : allows you to analyze any text and generate SIP's.
As set up here, it's configured to download iGEM wikis. To get a team's wiki, place the team name in a list; the software will calculate the frequency of all words in the text to establish a dictionary, and then calculate frequencies for each word on each team's page. SIP values are established, and the results are sorted in a database.
  • TOP_SIP.C : After you've generated SIP values, this bit of code allows you to pluck the top N SIP's, where N is any number you like. Specify the database and the names of teams you wish to use in a list file.
If the software finds more than N words above your given SIP value, it randomly picks N words. If it finds less, it gradually decreases the SIP value needed to enter the list until the list is composed of N words. The last argument, coefficient, is the value used to decrement your sip value level used to define your list. It's a number between 1 and 15. Adjust this number like you want to improve results and software speed.

SIP.C

/* SIP.C *** Make SIP database and dictionary *** v 1.0

PUBLIC DOMAIN
From iGEM team 2010 Paris

comment:

 This code permits you to calculate SIP words (most improbable words) in each
 wiki team.
 You need sqlite3, wget and links to run this program.

build:

 $ gcc -o sip sip.c -lsqlite3

usage :

 $ ./sip [list of team name] [name of database] [year]


Sqlite3 database :

Table for team.
+--------+-----------+------------+-----------+
| Words  | local occ | local freq | SIP value |
+--------+-----------+------------+-----------+
| string |   u_long  |  float     |   float   |  
|        |           |            |           |
+--------+-----------+------------+-----------+


Table name : Dictionary.
+--------+------------+-------------+
| word   | global_occ | global_freq |
+--------+------------+-------------+
| string |   u_long   |  float      |
|        |            |             |
+--------+------------+-------------+

*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <dirent.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sqlite3.h>

#define MAXBUFFER	256
#define LEN_CMD		96

#define TEAM_NAME 	1
#define DATABASE_NAME	2
#define YEAR		3

#define FALSE 		0
#define TRUE		1

#define IS_REP		1
#define NO_REP		2

/*
isRep says if a directory entry is a directory or a file.
@entryname : name (path) of the entry to test
return : IS_REP it's a directory (or it cannot be examined), NO_REP otherwise

The type is read with stat() rather than deduced from a failed fopen():
on some platforms (Linux/glibc for instance) opening a directory read-only
succeeds, so the fopen() test reported directories as files.
*/
int isRep( char *entryname )
{
	struct stat info;

	/* An entry that cannot be examined keeps being reported as IS_REP,
	   so callers go on skipping it. */
	if ( entryname == NULL || stat( entryname, &info ) != 0 )
		return IS_REP;

	return S_ISDIR( info.st_mode ) ? IS_REP : NO_REP;
}

/*
Replace every occurrence of one character by another one, in place.
@string : buffer to modify
@c : character to look for
@r : character written instead
@len : number of leading characters of the buffer to examine
return the number of characters replaced.
*/
unsigned long int chgchar( char *string, char c, char r, int len )
{
	unsigned long int replaced = 0;
	int pos;

	for ( pos = 0; pos < len; pos++ )
	{
		if ( string[pos] != c )
			continue;

		string[pos] = r;
		replaced++;
	}

	return replaced;
}

/*
Close the sqlite3 connection reached through a triple pointer.
@conn : address of a pointer to the connection handle (callers holding a
        sqlite3** pass its address)
return : 0 when the connection was closed, -1 when sqlite3_close() failed
         (the error message is printed).
*/
int close_sqlitedb( sqlite3*** conn )
{
	sqlite3 *db = **conn;

	if ( sqlite3_close( db ) != SQLITE_OK )
	{
		/* sqlite3_errmsg() wants the handle itself, not a pointer to it */
		printf("Error closing the db: %s\n", sqlite3_errmsg( db ));
		return -1;
	}

	return 0;
}

/*
Read the next word (run of characters up to a space or the end of file).
@word : buffer receiving the word. It is always NUL-terminated on return and
        holds the empty string when the end of file is reached before any
        character is read, or when a separator is met immediately. Callers
        should treat an empty word as "nothing to record".
@fp : descriptor to our file.

At most MAXBUFFER-1 characters are stored; the rest of a longer run is left
in the stream and returned by the following call(s).
*/
void read_next_word( char word[MAXBUFFER], FILE *fp )
{
	/* int, not char: EOF has to stay distinguishable from a valid byte */
	int ch;
	int n = 0;

	ch = fgetc(fp);

	/* keep one slot for the terminator */
	while ( ch != 0x20 && ch != EOF && n < MAXBUFFER - 1 )
	{
		word[n] = (char)ch;
		n++;
		ch = fgetc(fp);
	}
	word[n] = '\0';

	/* buffer full: give back the character that was read but not stored */
	if ( ch != 0x20 && ch != EOF ) {
		ungetc( ch, fp );
	}
}

int is_word_valid( char *word )
{
// there's no filter at the moment.
// We can compare with MeSH white list ie.
}

/*
Check if the word is already in the database.
@word : word to look up
@statement : prepared SELECT whose first parameter is the word
@conn : connection, used to report and clean up on error
return : FALSE when stepping the statement gives SQLITE_DONE (no row),
         TRUE for any other step result.
The program exits if the word cannot be bound to the statement.
*/
int is_word_exists( char *word, sqlite3_stmt **statement, sqlite3** conn )
{
	int found;
	int bind_rc = sqlite3_bind_text( *statement, 1, word, -1, SQLITE_STATIC);

	if ( bind_rc != SQLITE_OK ) {
		printf("Error binding to query select: %s\n", sqlite3_errmsg(*conn));
		close_sqlitedb( &conn );
		exit(1);
	}

	found = ( sqlite3_step( *statement ) == SQLITE_DONE ) ? FALSE : TRUE;

	/* the statement is reused for the next word */
	sqlite3_reset( *statement );

	return found;
}

/*
Insert a new word in the database.
@word : word to insert
@statement : prepared INSERT whose first parameter is the word
@conn : connection, used to report and clean up on error
The program exits if the word cannot be bound; the result of the step itself
is not checked.
*/
void add_to_the_list( char *word, sqlite3_stmt **statement, sqlite3** conn )
{
	int bind_rc = sqlite3_bind_text( *statement, 1, word, -1, SQLITE_STATIC);

	if ( bind_rc == SQLITE_OK ) {
		/* run the insert, then make the statement ready for the next word */
		sqlite3_step( *statement );
		sqlite3_reset( *statement );
		return;
	}

	printf("Error binding to query insert: %s\n", sqlite3_errmsg(*conn));
	close_sqlitedb( &conn );
	exit(1);
}

/*
Increment the occurrence count of a word that is already in the database.
@word : word whose counter is incremented
@statement : prepared UPDATE whose first parameter is the word
             (for example: UPDATE dictionary SET global_occ = global_occ + 1 WHERE word=?)
@conn : connection, used to report and clean up on error
The program exits if the word cannot be bound; the result of the step itself
is not checked.
*/
void inc_occ( char *word, sqlite3_stmt **statement, sqlite3** conn )
{
	sqlite3_stmt *update = *statement;

	if ( sqlite3_bind_text( update, 1, word, -1, SQLITE_STATIC) != SQLITE_OK ) {
		printf("Error binding to query update: %s\n", sqlite3_errmsg(*conn));
		close_sqlitedb( &conn );
		exit(1);
	}

	/* run the update, then make the statement ready for the next word */
	sqlite3_step( update );
	sqlite3_reset( update );
}

/*
Fill the frequency column of a table: global_freq = global_occ / nbr_of_words.
@conn : connection to the database
@nbr_of_words : divisor; written as a REAL in the query so that the division
                is not an integer division
@table : name of the table to update. It is copied verbatim into the query,
         so it must be a trusted identifier.
The program exits if the query cannot be built or compiled; a failure while
running it is reported but not fatal.
*/
void compute_freq( sqlite3** conn, float nbr_of_words, char *table )
{
	char sqlite3_query[MAXBUFFER];
	sqlite3_stmt *statement;
	int written;

	/* bounded formatting: a long table name must not overflow the buffer */
	written = snprintf( sqlite3_query, MAXBUFFER,
		"UPDATE %s SET global_freq = global_occ / %f", table, nbr_of_words);
	if ( written < 0 || written >= MAXBUFFER ) {
		printf("Error building the frequency request for %s\n", table);
		close_sqlitedb( &conn );
		exit(1);
	}

	if ( sqlite3_prepare_v2(*conn, sqlite3_query, -1, &statement, NULL ) != SQLITE_OK )
	{
		printf("Error compiling the request: %s\n", sqlite3_errmsg(*conn) );
		close_sqlitedb( &conn );
		exit(1);
	}

	if ( sqlite3_step( statement ) != SQLITE_DONE ) {
		printf("Error running the request: %s\n", sqlite3_errmsg(*conn) );
	}
	sqlite3_finalize( statement );
}

/*
Build the global dictionary by counting every word of every team digest.
@team_list : path of the file listing one team name per line
@database_name : path of the sqlite3 database holding the "dictionary" table
return : 0 on success, 1 when a query could not be compiled.

For each team the digest <team>/<team>.xtr is read word by word: an unknown
word is inserted with an occurrence of 1, a known word gets its occurrence
incremented. When all teams are done, global_freq is computed over the total
number of words read.

Team names are read like in the other passes of this file: each name must be
terminated by a newline, and a last line without one is ignored.
The program exits on file or directory errors.
*/
int make_dictionary( char *team_list, char *database_name)
{
	FILE *fp, *fp_wiki;
	float nbr_of_words = 0;
	char word[MAXBUFFER] = "";
	char digest_name[MAXBUFFER-LEN_CMD], team_name[MAXBUFFER-LEN_CMD];
	int size_name;
	int status = 1;

	/* NULL so that the cleanup path can finalize them unconditionally */
	sqlite3_stmt *stmt_select = NULL, *stmt_insert = NULL, *stmt_update = NULL;

	sqlite3 *conn;
	if ( sqlite3_open( database_name, &conn ) != SQLITE_OK )
	{
		printf("Error opening the db: %s\n", sqlite3_errmsg(conn));
		exit(1);
	}

	fp = fopen(team_list, "r");

	if ( fp == 0 ) {
		printf("Can't open %s file\n", team_list);
		exit(1);
	}

	// pre-compile the queries to optimize the process
	if ( sqlite3_prepare_v2(conn, "SELECT * FROM dictionary WHERE word=?", -1,
		&stmt_select, NULL ) != SQLITE_OK )
	{
		printf("Error compiling the request: %s\n", sqlite3_errmsg(conn) );
		goto CLOSE_DB;
	}

	if ( sqlite3_prepare_v2(conn, "INSERT INTO dictionary VALUES (?, 1, NULL)", -1,
		&stmt_insert, NULL ) != SQLITE_OK )
	{
		printf("Error compiling the request: %s\n", sqlite3_errmsg(conn) );
		goto CLOSE_DB;
	}

	if ( sqlite3_prepare_v2(conn, "UPDATE dictionary SET global_occ = global_occ + 1 WHERE word=?",
		-1, &stmt_update, NULL ) != SQLITE_OK )
	{
		printf("Error compiling the request: %s\n", sqlite3_errmsg(conn) );
		goto CLOSE_DB;
	}

	while( fgets( team_name, (MAXBUFFER - LEN_CMD), fp ) != NULL )
	{
		/* a last line without newline sets EOF: it is skipped */
		if ( feof(fp) ) {
			break;
		}

		/* drop the newline, then use the directory/table spelling of the name */
		size_name = strlen( team_name );
		team_name[size_name - 1] = '\0';
		chgchar( team_name, '-', '_', size_name);

		printf("On the %s Team\n", team_name);

		if ( chdir(team_name) == -1 ) {
			printf("error changing directory to %s\n", team_name);
			exit(1);
		}

		/* always NUL-terminated, unlike strncpy followed by strcat */
		snprintf( digest_name, sizeof digest_name, "%s.xtr", team_name );

		fp_wiki = fopen( digest_name, "r" );
		if( fp_wiki == 0 ) {
			printf("error when you open %s file", digest_name);
			exit(1);
		}

		if ( chdir("..") == -1 ) {
			printf("error changing directory to ..\n");
			exit(1);
		}

		/* one transaction per team keeps the many small writes fast */
		sqlite3_exec(conn, "begin", NULL, NULL, NULL );
		while ( !feof(fp_wiki) )
		{
			read_next_word( word, fp_wiki );
			if ( is_word_valid( word ) == FALSE ) {
				continue;
			}
			if ( is_word_exists( word, &stmt_select, &conn ) == FALSE ) {
				add_to_the_list( word, &stmt_insert, &conn );
			}
			else {
				inc_occ( word, &stmt_update, &conn );
			}

			nbr_of_words++;
		}
		sqlite3_exec(conn, "commit", NULL, NULL, NULL );
		fclose(fp_wiki);
	}
	status = 0;

CLOSE_DB :
	/* reached on success and on error: release everything acquired above;
	   sqlite3_finalize() accepts NULL, and sqlite3_close() would refuse to
	   close a connection that still owns prepared statements */
	fclose(fp);
	sqlite3_finalize( stmt_select );
	sqlite3_finalize( stmt_insert );
	sqlite3_finalize( stmt_update );

	if ( status == 0 ) {
		compute_freq( &conn, nbr_of_words, "dictionary" );
	}

	if ( sqlite3_close(conn) != SQLITE_OK ) {
		printf("Error closing the db: %s\n", sqlite3_errmsg(conn));
	}

	return status;
}

/*
Create the tables of the database: "dictionary" plus one table per team.
@team_list : path of the file listing one team name per line
@database_name : path of the sqlite3 database to open (created by sqlite3 if
                 needed)

Team tables are named after the team with '-' replaced by '_'; the name is
copied verbatim into the CREATE TABLE query.
Team names are read like in the other passes of this file: each name must be
terminated by a newline, and a last line without one is ignored.
The program exits if the database or the list cannot be opened; a query that
cannot be compiled stops the creation and closes the database.
*/
void make_database( char *team_list, char *database_name )
{
	sqlite3 *conn;
	sqlite3_stmt *statement = NULL;
	FILE* fp = NULL;
	int size_name;
	char team_name[MAXBUFFER-LEN_CMD], sqlite_query[MAXBUFFER];

	if ( sqlite3_open( database_name, &conn ) != SQLITE_OK )
	{
		printf("Error opening the db: %s\n", sqlite3_errmsg(conn));
		exit(1);
	}

	if ( sqlite3_prepare_v2(conn,
		"CREATE TABLE dictionary ( word text, global_occ int, global_freq real )",
		-1, &statement, NULL ) != SQLITE_OK )
	{
		printf("Error compiling the request: %s\n", sqlite3_errmsg(conn) );
		goto CLOSE_DB;
	}

	sqlite3_step( statement );
	sqlite3_finalize( statement );

	fp = fopen(team_list, "r");

	if ( fp == 0 ) {
		printf("Can't open %s file\n", team_list);
		exit(1);
	}

	while( fgets( team_name, (MAXBUFFER - LEN_CMD), fp ) != NULL )
	{
		/* a last line without newline sets EOF: it is skipped */
		if ( feof(fp) ) {
			break;
		}

		size_name = strlen( team_name );
		team_name[size_name - 1] = '\0';
		chgchar( team_name, '-', '_', size_name);

		printf("Create %s Team Table\n", team_name);

		snprintf( sqlite_query, MAXBUFFER,
			"CREATE TABLE %s ( word text, global_occ int, global_freq real, sip real )",
			team_name);
		if ( sqlite3_prepare_v2(conn, sqlite_query, -1, &statement, NULL ) != SQLITE_OK )
		{
			printf("Error compiling the request: %s\n", sqlite3_errmsg(conn) );
			goto CLOSE_DB;
		}

		sqlite3_step( statement );
		/* each team has its own statement: release it now, a reset alone
		   would leave it allocated and keep the connection from closing */
		sqlite3_finalize( statement );

	}

CLOSE_DB :
	if ( fp != NULL ) {
		fclose( fp );
	}

	if ( sqlite3_close(conn) != SQLITE_OK ) {
		printf("Error closing the db: %s\n", sqlite3_errmsg(conn));
	}

}

/*
Download the wiki of every listed team and turn it into a word digest.
@team_list : path of the file listing one team name per line
@year : iGEM year; "2007" selects the 2007 URL layout, any other value is
        used as the sub-domain of igem.org

For each team:
 - a directory named after the team ('-' replaced by '_') is created and
   entered; the program exits if it cannot be created (for instance when it
   already exists);
 - the pages are fetched with wget and every *.html file is dumped as text
   with links into <file>.dumpx;
 - all dumps are concatenated into a file named after the team, which is then
   filtered into <team>.xtr: letters are lowercased, letters, digits and '-'
   are kept, everything else becomes a single space.

Team names are read like in the other passes of this file: each name must be
terminated by a newline, and a last line without one is ignored.

NOTE(review): the team name and the year are pasted unescaped into shell
commands run with system(); the list file must come from a trusted source.
*/
void download_wiki( char *team_list, char* year )
{
	FILE *fp_src, *fp_tar, *fp;
	/* int, not char: fgetc() results must stay distinguishable from EOF */
	int ch, last_ch;
	char buffer[MAXBUFFER], team_name[MAXBUFFER-LEN_CMD], digest_name[MAXBUFFER-LEN_CMD];
	char dir_name[MAXBUFFER];
	int size_name;
	DIR *dp;
	struct dirent *entry;

	fp = fopen(team_list, "r");

	if ( fp == 0 ) {
		printf("Can't open %s file\n", team_list);
		exit(1);
	}

	while( fgets( team_name, (MAXBUFFER - LEN_CMD), fp ) != NULL )
	{
		/* a last line without newline sets EOF: it is skipped */
		if ( feof(fp) ) {
			break;
		}

		size_name = strlen( team_name );
		team_name[size_name - 1] = '\0';

		/* the directory name is filled first, then measured */
		snprintf( dir_name, MAXBUFFER, "%s", team_name );
		size_name = strlen( dir_name );
		chgchar( dir_name, '-', '_', size_name);

		if ( mkdir(dir_name, 0777) == -1 ) {
			printf("error in the mkdir\n");
			exit(1);
		}

		if ( chdir(dir_name) == -1 ) {
			printf("error changing the directory to %s\n", team_name);
			exit(1);
		}


		/* team_name still has its original spelling here: it is the one used in the URL */
		if ( strcmp( year, "2007") == 0 ) {
			snprintf(buffer, MAXBUFFER, "wget -R.jpg -R.png -R.gif \
				-R.jpeg -RUser:* -E -l1 -nd -r \
				https://2007.igem.org/%s", team_name);
			printf("CMD : %s\n", buffer);
			system(buffer);
		} else {
			snprintf(buffer, MAXBUFFER, "wget -E -ITeam:%s -nd -r \
				http://%s.igem.org/Team:%s", team_name, year, team_name);
			printf("CMD : %s\n", buffer);
			system(buffer);
		}

		if((dp = opendir(".")) == NULL) {
			printf("can't open the dir");
			fclose(fp);
			return;
		}

		/* dump every downloaded .html page as plain text */
		while((entry = readdir(dp)) != NULL) {
			if ( isRep(entry->d_name) != IS_REP ) {
				if ( (strstr(entry->d_name, ".html") != NULL)
					&& (strstr(entry->d_name, ".dumpx") == NULL) ) {
				snprintf(buffer, MAXBUFFER, "links -dump %s > %s.dumpx",
					entry->d_name, entry->d_name);
				system(buffer);
				}
			}
		}
		closedir(dp);

		/* from now on the team name is used as a file name */
		chgchar( team_name, '-', '_', size_name);


		snprintf(buffer, MAXBUFFER, "cat *.dumpx > %s", team_name);
		system(buffer);

		/* always NUL-terminated, unlike strncpy followed by strcat */
		snprintf( digest_name, sizeof digest_name, "%s.xtr", team_name );

		fp_src = fopen(team_name, "r");
		if ( fp_src == 0 ) {
			printf("Can't open %s file\n", team_name);
			exit(1);
		}

		fp_tar = fopen(digest_name, "w");
		if ( fp_tar == 0 ) {
			printf("Can't open %s file\n", digest_name);
			exit(1);
		}

		/* starting as if a space had just been written keeps separators
		   at the very beginning of the text out of the digest */
		last_ch = 0x20;
		while( (ch = fgetc(fp_src)) != EOF )
		{
			if ( (ch >= 0x41 && ch <= 0x5A) ||
			(ch >= 0x61 && ch <= 0x7A) ||
			(ch == 0x20 && last_ch != 0x20) ||
			ch == 0x2D ||
			(ch >= 0x30 && ch <= 0x39) ) {
				if (ch >= 0x41 && ch <= 0x5A) { //put in lowercase
					ch += 0x20;
				}
				fputc(ch, fp_tar);
			}
			else {
				/* anything else is a separator; runs of them give one space */
				ch = 0x20;

				if ( last_ch != 0x20 ) {
					fputc(ch, fp_tar);
				}
			}
			last_ch = ch;
		}

		fclose( fp_src );
		fclose( fp_tar );

		if ( chdir("..") == -1 ) {
			printf("error changing the directory to %s\n", team_name);
			exit(1);
		}
	}
	fclose(fp);
}

/*
Compute the SIP value of every word of a team table.
@conn : connection to the database
@team_name : name of the team table (copied verbatim into the queries)

For each row of the team table, the frequency stored in the team table
(column 2) is divided by the frequency of the same word in the dictionary
(column 2) and the quotient is written in the sip column of the team table.
A word that is not found in the dictionary is left untouched.
The program exits if a query cannot be compiled or a value cannot be bound.
*/
void compute_sip( sqlite3 **conn, char *team_name )
{
	sqlite3_stmt *statement, *stmt_select, *stmt_update;
	char sqlite_query[MAXBUFFER];
	float local_freq, global_freq;
	const char *word;

	snprintf( sqlite_query, MAXBUFFER, "SELECT * from %s", team_name);
	if ( sqlite3_prepare_v2(*conn, sqlite_query, -1, &statement, NULL ) != SQLITE_OK ) {
		printf("compute_sip() : Error compiling the request 1: %s\n",
			sqlite3_errmsg(*conn) );
		close_sqlitedb( &conn );
		exit(1);
	}

	if ( sqlite3_prepare_v2(*conn, "SELECT * from dictionary WHERE word=?", -1,
				&stmt_select, NULL ) != SQLITE_OK )
	{
		printf("compute_sip() : Error compiling the request 2: %s\n",
			sqlite3_errmsg(*conn) );
		close_sqlitedb( &conn );
		exit(1);
	}

	snprintf( sqlite_query, MAXBUFFER, "UPDATE %s SET sip = ?/? WHERE word=?",
		team_name);
	if ( sqlite3_prepare_v2(*conn, sqlite_query, -1, &stmt_update, NULL ) != SQLITE_OK ) {
		printf("compute_sip() : Error compiling the request 3: %s\n",
			sqlite3_errmsg(*conn) );
		close_sqlitedb( &conn );
		exit(1);
	}

	sqlite3_exec(*conn, "begin", NULL, NULL, NULL );
	while( sqlite3_step(statement) == SQLITE_ROW )
	{
		/* narrowed to float as before, so the stored SIP values do not change */
		local_freq = (float)sqlite3_column_double(statement, 2);
		/* stays valid until the next step on `statement` */
		word = (const char *)sqlite3_column_text(statement, 0);

		if ( sqlite3_bind_text( stmt_select, 1, word, -1,
					SQLITE_STATIC) != SQLITE_OK ) {
			printf("compute_sip() : Error binding 1: %s\n", sqlite3_errmsg(*conn));
			close_sqlitedb( &conn );
			exit(1);
		}

		/* columns may only be read when the step produced a row */
		if ( sqlite3_step( stmt_select ) != SQLITE_ROW ) {
			sqlite3_reset( stmt_select );
			continue;
		}
		global_freq = (float)sqlite3_column_double(stmt_select, 2);

		if ( sqlite3_bind_double( stmt_update, 1, (double)local_freq) != SQLITE_OK ) {
			printf("compute_sip() : Error binding 2: %s\n", sqlite3_errmsg(*conn));
			close_sqlitedb( &conn );
			exit(1);
		}

		if ( sqlite3_bind_double( stmt_update, 2, (double)global_freq) != SQLITE_OK ) {
			printf("compute_sip() : Error binding 3: %s\n", sqlite3_errmsg(*conn));
			close_sqlitedb( &conn );
			exit(1);
		}

		if ( sqlite3_bind_text( stmt_update, 3, word, -1,
					SQLITE_STATIC) != SQLITE_OK ) {
			printf("compute_sip() : Error binding 4: %s\n", sqlite3_errmsg(*conn));
			close_sqlitedb( &conn );
			exit(1);
		}

		sqlite3_step( stmt_update );

		/* both statements are reused for the next row */
		sqlite3_reset( stmt_select );
		sqlite3_reset( stmt_update );
	}
	sqlite3_exec(*conn, "commit", NULL, NULL, NULL );

	sqlite3_finalize( statement );
	sqlite3_finalize( stmt_select );
	sqlite3_finalize( stmt_update );
}

void make_sipword( char *team_list, char *database_name)
{
	FILE *fp_wiki, *fp;
	float nbr_of_words = 0;
	char word[MAXBUFFER], digest_name[MAXBUFFER-LEN_CMD], team_name[MAXBUFFER-LEN_CMD];
	char sqlite_query[MAXBUFFER];
	int size_name;

	sqlite3_stmt *stmt_select, *stmt_insert, *stmt_update;

	sqlite3 *conn;
	if ( sqlite3_open( database_name, &conn ) != SQLITE_OK )
	{
		printf("Error opening the db: %s\n", sqlite3_errmsg(conn));
		exit(1);
	}
	
	fp = fopen(team_list, "r");

	if ( fp == 0 ) {
		printf("Can't open %s file\n", team_list);
		exit(1);
	}

	while( !feof(fp) )
	{
		fgets( team_name, (MAXBUFFER - LEN_CMD), fp );

		if ( feof(fp) ) {
			break;
		}

		size_name = strlen( team_name );
		team_name[size_name - 1] = '\0';
		chgchar( team_name, '-', '_', size_name);


		printf("On the %s Team\n", team_name);

		// pre-compile the query to optimize the process
		snprintf( sqlite_query, MAXBUFFER, "SELECT * FROM %s WHERE word=?",
			team_name);

		if ( sqlite3_prepare_v2(conn, sqlite_query , -1,
			&stmt_select, NULL ) != SQLITE_OK )
		{
			printf("Error compiling the request: %s\n", sqlite3_errmsg(conn) );
			goto CLOSE_DB;
			exit(1);
		}

		snprintf( sqlite_query, MAXBUFFER, "INSERT INTO %s \
			VALUES (?, 1, NULL, NULL)", team_name);

		if ( sqlite3_prepare_v2(conn, sqlite_query, -1,
			&stmt_insert, NULL ) != SQLITE_OK )
		{
			printf("Error compiling the request: %s\n", sqlite3_errmsg(conn) );
			goto CLOSE_DB;
			exit(1);
		}
	
		snprintf( sqlite_query, MAXBUFFER, "UPDATE %s \
			SET global_occ = global_occ + 1 WHERE word=?", team_name);

		if ( sqlite3_prepare_v2(conn, sqlite_query, -1,
			&stmt_update, NULL ) != SQLITE_OK )
		{
			printf("Error compiling the request: %s\n", sqlite3_errmsg(conn) );
			goto CLOSE_DB;
			exit(1);
		}
	
		if ( chdir(team_name) == -1 ) {
			printf("error changing directory to %s\n", team_name);
			exit(1);
		}

		strncpy( digest_name, team_name, MAXBUFFER-LEN_CMD-4 );
		strcat( digest_name, ".xtr");

		fp_wiki = fopen( digest_name, "r" );
		if( fp_wiki == 0 ) {
			printf("error when you open %s file", digest_name);
			exit(1);
		}

		if ( chdir("..") == -1 ) {
			printf("error changing directory to ..\n");
			exit(1);
		}


		sqlite3_exec(conn, "begin", NULL, NULL, NULL );
		while ( !feof(fp_wiki) )
		{
			read_next_word( word, fp_wiki );
			if ( is_word_valid( word ) == FALSE ) {
				continue;
			}
			if ( is_word_exists( word, &stmt_select, &conn ) == FALSE ) {
				add_to_the_list( word, &stmt_insert, &conn );
			}
			else {
				inc_occ( word, &stmt_update, &conn );
			}
		
			nbr_of_words++;
		}
		sqlite3_exec(conn, "commit", NULL, NULL, NULL );
		fclose(fp_wiki);
		sqlite3_finalize( stmt_select );
		sqlite3_finalize( stmt_insert );
		sqlite3_finalize( stmt_update );

		compute_freq( &conn, nbr_of_words, team_name );
		compute_sip( &conn, team_name);
	}

CLOSE_DB :
	if ( sqlite3_close(conn) != SQLITE_OK ) {
		printf("Error closing the db: %s\n", sqlite3_errmsg(conn));
	}
}

/*
Entry point: optional download, then database, dictionary and SIP passes.
argv[TEAM_NAME] : file listing the team names
argv[DATABASE_NAME] : sqlite3 database file
argv[YEAR] : optional; when absent the download step is skipped
*/
int main( int argc, char *argv[] )
{
	/* argv is NULL-terminated, so a missing argument shows up as a null
	   pointer; && stops at the first missing one */
	const int has_required = ( argv[TEAM_NAME] != 0 && argv[DATABASE_NAME] != 0 );
	char *team_list, *database_name, *year;

	if ( !has_required ) {
		printf("usage : %s [list of team name] [name of database] [year]\n \
	- don't put [year] if you want to skip the download step.\n", argv[0] );
		exit(1);
	}

	team_list = argv[TEAM_NAME];
	database_name = argv[DATABASE_NAME];
	year = argv[YEAR];

	if ( year != 0 ) {
		printf("start to download wiki !\n");
		download_wiki( team_list, year );
	} else {
		printf("skip the downlad step !\n");
	}

	printf("make the database !\n");
	make_database( team_list, database_name );

	printf("start to make a dictionary !\n");
	make_dictionary( team_list, database_name );

	printf("start calculate SIP words !\n");
	make_sipword( team_list, database_name );

	return 0;
}



TOP-SIP.CPP

/* TOP_SIP.C *** Give the top high SIP words *** v 1.0

PUBLIC DOMAIN
From iGEM team 2010 Paris

comment:

 This code allows you to get the words with the highest SIP values.
 Run SIP.C before using this tool.
 For the random module, you can use what you want, we use
 MersenneTwister written in C++, that's why you have to compile
 with g++ in this case.

 MersenneTwister : http://www-personal.umich.edu/~wagnerr/MersenneTwister.html

build:

 $ g++ -o top-sip top-sip.c -lsqlite3

usage :

 $ ./top-sip [list of team name] [name of database] [coeff]


*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <dirent.h>
#include <sqlite3.h>
#include "MersenneTwister.hpp"


#define MAXBUFFER	256
#define LEN_CMD		96

#define TEAM_NAME 	1
#define DATABASE_NAME	2
#define DEC_COEFF	3

#define FALSE 		0
#define TRUE		1

#define MIN_COEFF	0
#define MAX_COEFF	15

#define WORD_C	0
#define FREQ_C	2
#define SIP_C	3

/******** customizable constant **********/

//// The number of result you want
#define TOP 	25

//// Active the filter to remove words with a small number of alpha char,
//// but big number
//// of numeric char
#define FILTER		1
//// Minimum numeric char to ban the word
#define CHAR_NUM_MIN	1
//// Maximum alpha char to ban the word
#define CHAR_ALPHA_MAX	3

/******************************************/



/*
Swap one character for another throughout a buffer, in place.
@string : buffer to modify
@c : character to replace
@r : replacement character
@len : number of leading characters of the buffer to examine
return the number of characters that were replaced.
*/
unsigned long int chgchar( char *string, char c, char r, int len )
{
	unsigned long int hits = 0;
	char *cursor = string;
	int remaining = len;

	while ( remaining > 0 )
	{
		if ( *cursor == c )
		{
			*cursor = r;
			hits++;
		}
		cursor++;
		remaining--;
	}

	return hits;
}

/*
Check if the word is one we want to keep.
@string : your word
@len: length of your string
return: FALSE if you need to remove the word, TRUE otherwise

Two filters are applied:
 - DNA sequences: a word made only of the letters a, c, g and t is removed
   (this also removes ordinary short words spelled with these letters only);
 - alpha-numeric: a word with at least CHAR_NUM_MIN non-alphabetic characters
   and at most CHAR_ALPHA_MAX alphabetic ones is removed.
An empty word is removed.
*/
int is_word_is_valid( char *string, int len )
{
	int i;
	int n=FALSE;
	int alph_char = 0;
	int numb_char = 0;

	//// DNA sequences filter
	//// the word is kept as soon as one character is none of a, g, t, c;
	//// the tests are joined with && because with || the condition is
	//// true for every character and the filter never removes anything
	for ( i = 0; i < len; i++ )
	{
		if ( string[i] != 'a' && string[i] != 'g' && string[i] != 't' && string[i] != 'c' ) {
			n = TRUE;
			break;
		}
	}

	/// Alpha-numeric filter
	/// everything that is not an ASCII letter counts as "numeric"
	for ( i = 0; i < len; i++ )
	{
		if ( (string[i] >= 0x41 && string[i] <= 0x5A) || (string[i] >= 0x61 && string[i] <= 0x7A) ) {
			alph_char++;
		}
		else {
			numb_char++;
		}
	}

	if( numb_char >= CHAR_NUM_MIN && alph_char <= CHAR_ALPHA_MAX) {
		n = FALSE;
	}

	return n;
}

/*
pick a random number between 0 and X, use your favorite random algorithm ! I use Mersenne Twister.
@prob : your X, so you have 1/X prob to get the good number. X+1 numbers can be choosen.
return : the random number
*/
int pickrandom( int prob )
{
	MTRand mtrand1;
	int d = mtrand1.randInt( prob );
	return d;
}

/*
Find the power of ten that brings a frequency up to at least 1.
@min_freq : smallest frequency of the table, expected to be in ]0, 1]
return : 1 when min_freq is already >= 1, otherwise the first power of ten m
         for which min_freq, multiplied by ten as many times, is no longer
         below 1.
No multiplier exists for min_freq <= 0, so 1 is returned instead of looping
for ever. The result is capped at the largest power of ten that fits in an
unsigned long, so it cannot wrap around for very small frequencies.
*/
unsigned long int compute_mul( double min_freq )
{
	/* beyond this value, multiplying by ten again would overflow */
	const unsigned long int limit = (unsigned long int)-1 / 10;
	unsigned long int mult = 1;

	/* written this way so that NaN is rejected too */
	if ( !(min_freq > 0) ) {
		return mult;
	}

	while( min_freq < 1 && mult <= limit )
	{
		min_freq = min_freq * 10;
		mult = mult * 10;
	}

	return mult;
}

void compute_top_sip( char *team_list, char *database_name, float dec_coeff )
{
	FILE *fp, *fp_team;
	sqlite3_stmt *statement;

	char sqlite_query[MAXBUFFER];
	char top_words[TOP][MAXBUFFER];
	char team_name[MAXBUFFER-LEN_CMD];

	int wanted_stuff, total_stuff, max_interval, i;
	unsigned long int t, multiplicator;
	int size_name, index;
	double top_freq[TOP];
	double max_sip, curr_sip;
	double min_freq, curr_freq;

	sqlite3 *conn;
	if ( sqlite3_open( database_name, &conn ) != SQLITE_OK )
	{
		printf("Error opening the db: %s\n", sqlite3_errmsg(conn));
		exit(1);
	}

	fp = fopen(team_list, "r");

	if ( fp == 0 ) {
		printf("Can't open %s file\n", team_list);
		exit(1);
	}

	while( !feof(fp) )
	{

		fgets( team_name, (MAXBUFFER - LEN_CMD), fp );

		if ( feof(fp) ) {
			break;
		}

		size_name = strlen( team_name );
		team_name[size_name - 1] = '\0';
		chgchar( team_name, '-', '_', size_name);

		printf("On the %s Team\n", team_name);

	FIND_HIGH_SIP:
		max_sip = 0, min_freq = 1;

		snprintf( sqlite_query, MAXBUFFER, "SELECT * FROM %s", team_name);

		if ( sqlite3_prepare_v2(conn, sqlite_query , -1,
			&statement, NULL ) != SQLITE_OK )
		{
			printf("COMPUTE HIGHT SIP : Error compiling the request: %s\n", sqlite3_errmsg(conn) );
			goto CLOSE_DB;
			exit(1);
		}

		t = 0;
		while ( sqlite3_step( statement ) != SQLITE_DONE ) {
			curr_sip = (double)sqlite3_column_double(statement, SIP_C);
			if ( curr_sip > max_sip ) {
				max_sip = curr_sip;
			}
			curr_freq = (double)sqlite3_column_double(statement, FREQ_C);
			if ( curr_freq < min_freq ) {
				min_freq = curr_freq;
			}
			t++;
		}
		// For Wordle, to avoid float value < 0 we use a multiplicator.
		multiplicator = compute_mul( min_freq );
		printf("hight sip is %.15f, for %ld entry and multiplicator %ld\n", max_sip, t, multiplicator);
		sqlite3_finalize( statement );

	GET_THE_TOP_WORDS:
		int loop = TRUE;
		int prev_rest = TOP;
		int rest = 0;
		curr_sip = max_sip;
		index = 0;
		int nbr_words_registred = 0;

		while( loop == TRUE )
		{
			max_interval = 0;
			wanted_stuff = TOP;
			total_stuff = 0;
			loop = FALSE;
	
			snprintf( sqlite_query, MAXBUFFER,
				"SELECT * FROM %s WHERE sip > %.15f AND sip <= %.15f",
				team_name, curr_sip, max_sip);
	
			if ( sqlite3_prepare_v2(conn, sqlite_query , -1,
				&statement, NULL ) != SQLITE_OK )
			{
				printf("PICK TOP FIVE : Error compiling the request: %s\n",
					sqlite3_errmsg(conn) );
				goto CLOSE_DB;
				exit(1);
			}
	
	
			while ( sqlite3_step( statement ) != SQLITE_DONE ) {
				total_stuff++;
			}
	
			rest = prev_rest - total_stuff;
			//printf("rest = %d, prev_rest=%d, total_stuff = %d\n", rest, prev_rest, total_stuff);
			// debug
	
			if( rest > 0 ) {
	
				if ( rest < prev_rest ) {
					max_sip = curr_sip;
				}
				curr_sip = curr_sip - dec_coeff;
				if ( curr_sip <= 0 ) {
					continue;
				}
				loop = TRUE;
			}
	
			max_interval = total_stuff / prev_rest; // we put the fraction to 1/X and we save X.
			max_interval--;
	
			if ( max_interval <= 0 ) {
				max_interval = 0;
			}
	
			/*printf("prob is : %f, (%d/%d), give a number between 0 and %d\n",
				(float)prev_rest / total_stuff, prev_rest, total_stuff, max_interval);*/
			// debug
	
	
			while ( sqlite3_step( statement ) != SQLITE_DONE && rest != prev_rest ) {
	
				if ( pickrandom(max_interval) == FALSE ) {
					strncpy( top_words[index],
						(const char*)sqlite3_column_text(statement, WORD_C), MAXBUFFER);
					top_freq[index] = (double)sqlite3_column_double(statement, FREQ_C);
					index++;
	
					if ( rest <= 0 ) {
						if ( (index - nbr_words_registred) == prev_rest ) {
						break;
						}
					} else { 
						if ( (index - nbr_words_registred) == total_stuff ) {
							break;
						}
					}
				}		
			}		
			
			prev_rest = rest;
			nbr_words_registred = index;
		}
	
		printf("Your top-words for this team is :\n");

		fp_team = fopen(team_name, "w");

		if( fp_team == 0 ) {
			printf("error opening the %s file\n", team_name);
			exit(1);
		}

		for( index=0;index<TOP;index++) {
			if ( FILTER ) {
				if ( is_word_is_valid(top_words[index], strlen(top_words[index]) ) == FALSE ) {
					continue;
				}
			}
			
			printf("\t- %s (freq = %f; len=%d)\n", top_words[index],
				(float)(top_freq[index] * multiplicator),
				(int)(top_freq[index] * multiplicator));

			fprintf(fp_team, "%s: %d\n", top_words[index],
				(int)(top_freq[index] * multiplicator));
		}
		fclose( fp_team );
		sqlite3_finalize( statement );
		
	}
	fclose(fp);

CLOSE_DB :
	if ( sqlite3_close(conn) != SQLITE_OK ) {
		printf("Error closing the db: %s\n", sqlite3_errmsg(conn));
	}
}

/*
Entry point: checks the command line and runs compute_top_sip().
Expected arguments (positions given by TEAM_NAME, DATABASE_NAME, DEC_COEFF):
  - the file listing the team names
  - the SQLite database
  - the dec-coeff exponent N, an integer between MIN_COEFF and MAX_COEFF;
    the SIP window width passed on is 10^-N.
return : 0 on success; exits/returns 1 when arguments are missing or the
         dec-coeff is not a valid integer in range.
*/
int main( int argc, char *argv[] )
{
	char *end = NULL;
	long coeff;

	// argv[k] only exists for k < argc, so check argc before indexing.
	if ( argc <= TEAM_NAME || argc <= DATABASE_NAME || argc <= DEC_COEFF ) {
		printf("usage : %s [list of team name] [name of database] [dec-coeff]\n", argv[0] );
		printf("\tUse 0 to 15 to define a dec-coeff.\n");
		exit(1);
	}

	// strtol (unlike atoi) lets us reject input that is not a number
	// instead of silently treating it as 0.
	coeff = strtol( argv[DEC_COEFF], &end, 10 );

	if ( end == argv[DEC_COEFF] || *end != '\0'
		|| coeff < MIN_COEFF || coeff > MAX_COEFF ) {
		printf("Use 0 to 15 to define a dec-coeff.\n");
		return 1;
	}

	printf("Compute with 10^-%s dec_coeff!\n", argv[DEC_COEFF]);
	compute_top_sip( argv[TEAM_NAME], argv[DATABASE_NAME], powf( 10, -(float)coeff ) );

	return 0;
}