Team:Paris Liliane Bettencourt/Project/SIP/Codes

From 2010.igem.org

(Difference between revisions)
Theotime (Talk | contribs)
(New page: {{Template:Paris2010_2}} <html> <p style="display:block"> <a href=""https://2010.igem.org/Team:Paris_Liliane_Bettencourt/Projects/SIP"> <img src="https://static.igem.org/mediawiki/2010/4/4c/SI...)
Newer edit →

Revision as of 09:41, 24 October 2010



SIP Wiki Analyser : Codes





SIP.C

/* SIP.C *** Make SIP database and dictionary

PUBLIC DOMAIN
From iGEM team 2010 Paris

comment:

 This program computes the SIP words (most improbable words) of each
 team wiki.
 You need sqlite3, wget and links installed to run this program.

build:

 $ gcc -o sip sip.c -lsqlite3

usage :

 $ ./sip [list of team name] [name of database] [year]


Sqlite3 database :

Table for team.
+--------+-----------+------------+-----------+
| Words  | local occ | local freq | SIP value |
+--------+-----------+------------+-----------+
| string |   u_long  |  float     |   float   |  
|        |           |            |           |
+--------+-----------+------------+-----------+


Table name : Dictionary.
+--------+------------+-------------+
| word   | global_occ | global_freq |
+--------+------------+-------------+
| string |   u_long   |  float      |
|        |            |             |
+--------+------------+-------------+

*/

#include <dirent.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

#include <sqlite3.h>

#define MAXBUFFER	256
#define LEN_CMD		96

#define TEAM_NAME 	1
#define DATABASE_NAME	2
#define YEAR		3

#define FALSE 		0
#define TRUE		1

#define IS_REP		1
#define NO_REP		2

/*
isRep tells whether a directory entry is a directory or a file.
@entryname : path of the entry to test
return : IS_REP when the entry is a directory or cannot be inspected,
         NO_REP otherwise.
*/
int isRep( char *entryname )
{
	struct stat info;

	/* A read-only fopen() is not a reliable probe: some C libraries open
	   directories successfully, so the entry type is asked for directly. */
	if ( stat( entryname, &info ) != 0 )
		return IS_REP;	/* entries that cannot be inspected are skipped too */

	return S_ISDIR( info.st_mode ) ? IS_REP : NO_REP;
}

/*
chgchar replaces a character by another one inside a buffer.
@string : buffer to modify in place
@c : character to look for
@r : character written instead of every `c`
@len : number of leading bytes of `string` to scan
*/
void chgchar( char *string, char c, char r, int len )
{
	int pos;

	for ( pos = 0; pos < len; pos++ )
	{
		if ( string[pos] != c )
			continue;

		string[pos] = r;
	}
}

/*
close_sqlitedb closes a connection reached through three levels of
indirection (callers hold a `sqlite3 **` and pass its address).
@conn : address of a pointer to the connection handle
return : the sqlite3_close() result code, SQLITE_OK on success
*/
int close_sqlitedb( sqlite3*** conn )
{
	int rc = sqlite3_close( **conn );

	if ( rc != SQLITE_OK )
	{
		/* The message must be asked to the connection handle itself,
		   which is **conn, the same object sqlite3_close() received. */
		printf("Error closing the db: %s\n", sqlite3_errmsg(**conn));
	}

	return rc;
}

/*
read_next_word reads the characters of `fp` up to the next space (0x20) or
the end of the stream and stores them in `word` as a NUL-terminated string.
@word : destination buffer of MAXBUFFER bytes
@fp : stream to read from
`word` is the empty string when the stream is already at its end or when
the next character is a space. A word longer than MAXBUFFER-1 characters
is split: the remainder is returned by the following call.
*/
void read_next_word( char word[MAXBUFFER], FILE *fp )
{
	int ch;		/* int, so that EOF stays distinct from every byte value */
	int n = 0;

	/* Never leave a stale word in the buffer when nothing can be read. */
	word[0] = '\0';

	ch = fgetc(fp);

	/* One byte is kept free for the terminator. */
	while ( ch != 0x20 && ch != EOF && n < MAXBUFFER - 1 )
	{
		word[n] = (char)ch;
		n++;
		ch = fgetc(fp);
	}
	word[n] = '\0';

	/* The buffer is full but the word is not finished: give the pending
	   character back so that it is not lost. */
	if ( ch != 0x20 && ch != EOF ) {
		ungetc( ch, fp );
	}
}

/*
is_word_valid decides whether a word has to be counted.
There is no filter at the moment, so every word is accepted. A comparison
with a white list (MeSH for instance) could be added here.
@word : NUL-terminated word to test
return : non-zero when the word is accepted, 0 when it has to be skipped
*/
int is_word_valid( const char *word )
{
	(void)word;	/* unused until a real filter exists */

	return 1;
}

/*
is_word_exists binds `word` to parameter 1 of the prepared statement, steps
it once and resets it.
@word : NUL-terminated word to look for
@statement : address of the prepared statement to run
@conn : address of the connection handle, used for error reporting
return : FALSE when the step ends with SQLITE_DONE, TRUE for any other
         step result (a row, but also an error code).
The program is ended with exit(1) when the binding fails.
*/
int is_word_exists( char *word, sqlite3_stmt **statement, sqlite3** conn )
{
	int step_rc;
	int found;

	if ( sqlite3_bind_text( *statement, 1, word, -1, SQLITE_STATIC) != SQLITE_OK ) {
		printf("Error binding to query select: %s\n", sqlite3_errmsg(*conn));
		close_sqlitedb( &conn );
		exit(1);
	}

	step_rc = sqlite3_step( *statement );
	found = ( step_rc == SQLITE_DONE ) ? FALSE : TRUE;

	/* The statement is reset on both outcomes so it can be reused. */
	sqlite3_reset( *statement );

	return found;
}

/*
add_to_the_list binds `word` to parameter 1 of the prepared statement,
executes it once and resets it. The result of the step is not checked.
@word : NUL-terminated word to bind
@statement : address of the prepared statement to run
@conn : address of the connection handle, used for error reporting
The program is ended with exit(1) when the binding fails.
*/
void add_to_the_list( char *word, sqlite3_stmt **statement, sqlite3** conn )
{
	sqlite3_stmt *insert = *statement;
	int bind_rc = sqlite3_bind_text( insert, 1, word, -1, SQLITE_STATIC );

	if ( bind_rc != SQLITE_OK ) {
		printf("Error binding to query insert: %s\n", sqlite3_errmsg(*conn));
		close_sqlitedb( &conn );
		exit(1);
	}

	sqlite3_step( insert );
	sqlite3_reset( insert );
}

/*
inc_occ binds `word` to parameter 1 of the prepared statement, executes it
once and resets it. The result of the step is not checked.
The statement is expected to look like:
  UPDATE dictionary SET global_occ = global_occ + 1 WHERE word=?
@word : NUL-terminated word to bind
@statement : address of the prepared statement to run
@conn : address of the connection handle, used for error reporting
The program is ended with exit(1) when the binding fails.
*/
void inc_occ( char *word, sqlite3_stmt **statement, sqlite3** conn )
{
	sqlite3_stmt *update = *statement;
	int bind_rc = sqlite3_bind_text( update, 1, word, -1, SQLITE_STATIC );

	if ( bind_rc != SQLITE_OK ) {
		printf("Error binding to query update: %s\n", sqlite3_errmsg(*conn));
		close_sqlitedb( &conn );
		exit(1);
	}

	sqlite3_step( update );
	sqlite3_reset( update );
}

/*
compute_freq sets, for every row of `table`:
  global_freq = global_occ / nbr_of_words
@conn : address of the connection handle
@nbr_of_words : divisor, written into the request as a REAL literal
@table : name of the table to update, inserted as is in the request
The program is ended with exit(1) when the request cannot be built or
compiled. A failure while running it is only reported.
*/
void compute_freq( sqlite3** conn, float nbr_of_words, char *table )
{
	char sqlite3_query[MAXBUFFER];
	sqlite3_stmt *statement;
	int len;

	/* Bounded formatting: `table` comes from the caller and must not be
	   able to overflow the request buffer. */
	len = snprintf( sqlite3_query, sizeof sqlite3_query,
		"UPDATE %s SET global_freq = global_occ / %f",
		table, nbr_of_words);

	if ( len < 0 || (size_t)len >= sizeof sqlite3_query )
	{
		printf("Error building the request for table %s\n", table );
		close_sqlitedb( &conn );
		exit(1);
	}

	if ( sqlite3_prepare_v2(*conn, sqlite3_query, -1, &statement, NULL ) != SQLITE_OK )
	{
		printf("Error compiling the request: %s\n", sqlite3_errmsg(*conn) );
		close_sqlitedb( &conn );
		exit(1);
	}

	if ( sqlite3_step( statement ) != SQLITE_DONE )
	{
		printf("Error running the request: %s\n", sqlite3_errmsg(*conn) );
	}
	sqlite3_finalize( statement );
}

/*
make_dictionary counts, in the `dictionary` table, the occurrences of every
word found in the <team>/<team>.xtr digest of each listed team, then turns
the counts into frequencies with compute_freq().
@team_list : path of the text file holding one team name per line
@database_name : path of the sqlite3 database
return : 0 on success, 1 when a request could not be compiled. File and
         directory errors end the program with exit(1).
*/
int make_dictionary( char *team_list, char *database_name)
{
	FILE *fp = NULL, *fp_wiki;
	/* Integer counter: a float stops growing once it reaches 2^24. */
	unsigned long nbr_of_words = 0;
	char word[MAXBUFFER], digest_name[MAXBUFFER-LEN_CMD], team_name[MAXBUFFER-LEN_CMD];
	int size_name, len;
	int status = 1;

	/* NULL so that the cleanup code can finalize them unconditionally. */
	sqlite3_stmt *stmt_select = NULL, *stmt_insert = NULL, *stmt_update = NULL;

	sqlite3 *conn;
	if ( sqlite3_open( database_name, &conn ) != SQLITE_OK )
	{
		printf("Error opening the db: %s\n", sqlite3_errmsg(conn));
		exit(1);
	}

	fp = fopen(team_list, "r");

	if ( fp == 0 ) {
		printf("Can't open %s file\n", team_list);
		exit(1);
	}

	// pre-compile the query to optimize the process
	if ( sqlite3_prepare_v2(conn, "SELECT * FROM dictionary WHERE word=?", -1,
		&stmt_select, NULL ) != SQLITE_OK )
	{
		printf("Error compiling the request: %s\n", sqlite3_errmsg(conn) );
		goto CLOSE_DB;
	}

	if ( sqlite3_prepare_v2(conn, "INSERT INTO dictionary VALUES (?, 1, NULL)", -1,
		&stmt_insert, NULL ) != SQLITE_OK )
	{
		printf("Error compiling the request: %s\n", sqlite3_errmsg(conn) );
		goto CLOSE_DB;
	}

	if ( sqlite3_prepare_v2(conn, "UPDATE dictionary \
		SET global_occ = global_occ + 1 WHERE word=?",
		-1, &stmt_update, NULL ) != SQLITE_OK )
	{
		printf("Error compiling the request: %s\n", sqlite3_errmsg(conn) );
		goto CLOSE_DB;
	}

	/* The fgets() result drives the loop, so the last line is processed
	   even when it does not end with a newline. */
	while ( fgets( team_name, sizeof team_name, fp ) != NULL )
	{
		team_name[strcspn( team_name, "\r\n" )] = '\0';
		if ( team_name[0] == '\0' ) {
			continue;	/* blank line */
		}

		size_name = strlen( team_name );
		chgchar( team_name, '-', '_', size_name);
		printf("On the %s Team\n", team_name);

		len = snprintf( digest_name, sizeof digest_name, "%s.xtr", team_name );
		if ( len < 0 || (size_t)len >= sizeof digest_name ) {
			printf("team name %s is too long\n", team_name);
			exit(1);
		}

		if ( chdir(team_name) == -1 ) {
			printf("error changing directory to %s\n", team_name);
			exit(1);
		}

		fp_wiki = fopen( digest_name, "r" );
		if( fp_wiki == 0 ) {
			printf("error when you open %s file", digest_name);
			exit(1);
		}

		if ( chdir("..") == -1 ) {
			printf("error changing directory to ..\n");
			exit(1);
		}

		sqlite3_exec(conn, "begin", NULL, NULL, NULL );
		/* ferror() is tested too: a read error never sets the EOF flag. */
		while ( !feof(fp_wiki) && !ferror(fp_wiki) )
		{
			/* Cleared first, so a call that reads nothing cannot leave
			   the previous word in the buffer. */
			word[0] = '\0';
			read_next_word( word, fp_wiki );
			if ( word[0] == '\0' ) {
				continue;
			}
			if ( is_word_valid( word ) == FALSE ) {
				continue;
			}
			if ( is_word_exists( word, &stmt_select, &conn ) == FALSE ) {
				add_to_the_list( word, &stmt_insert, &conn );
			}
			else {
				inc_occ( word, &stmt_update, &conn );
			}

			nbr_of_words++;
		}
		sqlite3_exec(conn, "commit", NULL, NULL, NULL );
		fclose(fp_wiki);
	}

	compute_freq( &conn, (float)nbr_of_words, "dictionary" );
	status = 0;

CLOSE_DB :
	/* Every statement has to be finalized before sqlite3_close(). */
	sqlite3_finalize( stmt_select );
	sqlite3_finalize( stmt_insert );
	sqlite3_finalize( stmt_update );
	if ( fp != NULL ) {
		fclose(fp);
	}

	if ( sqlite3_close(conn) != SQLITE_OK ) {
		printf("Error closing the db: %s\n", sqlite3_errmsg(conn));
	}

	return status;
}

/*
make_database creates the `dictionary` table and one table per listed team.
@team_list : path of the text file holding one team name per line
@database_name : path of the sqlite3 database
The function stops at the first request that cannot be compiled (for
instance when a table already exists). The program is ended with exit(1)
when the database or the team list cannot be opened.
*/
void make_database( char *team_list, char *database_name )
{
	sqlite3 *conn;
	sqlite3_stmt *statement = NULL;
	FILE* fp = NULL;
	int size_name, len;
	char team_name[MAXBUFFER-LEN_CMD], sqlite_query[MAXBUFFER];

	if ( sqlite3_open( database_name, &conn ) != SQLITE_OK )
	{
		printf("Error opening the db: %s\n", sqlite3_errmsg(conn));
		exit(1);
	}

	if ( sqlite3_prepare_v2(conn, "CREATE TABLE dictionary ( word text,  \
		global_occ int, global_freq real )", -1, &statement, NULL ) != SQLITE_OK )
	{
		printf("Error compiling the request: %s\n", sqlite3_errmsg(conn) );
		goto CLOSE_DB;
	}

	sqlite3_step( statement );
	sqlite3_finalize( statement );
	statement = NULL;

	fp = fopen(team_list, "r");

	if ( fp == 0 ) {
		printf("Can't open %s file\n", team_list);
		exit(1);
	}

	/* The fgets() result drives the loop, so the last line is processed
	   even when it does not end with a newline. */
	while ( fgets( team_name, sizeof team_name, fp ) != NULL )
	{
		team_name[strcspn( team_name, "\r\n" )] = '\0';
		if ( team_name[0] == '\0' ) {
			continue;	/* blank line */
		}

		size_name = strlen( team_name );
		chgchar( team_name, '-', '_', size_name);
		printf("Create %s Team Table\n", team_name);

		len = snprintf( sqlite_query, sizeof sqlite_query, "CREATE TABLE %s ( word text, global_occ int, \
			global_freq real, sip real )", team_name);
		if ( len < 0 || (size_t)len >= sizeof sqlite_query ) {
			printf("team name %s is too long\n", team_name);
			goto CLOSE_DB;
		}

		if ( sqlite3_prepare_v2(conn, sqlite_query, -1, &statement, NULL ) != SQLITE_OK )
		{
			printf("Error compiling the request: %s\n", sqlite3_errmsg(conn) );
			goto CLOSE_DB;
		}

		sqlite3_step( statement );
		/* A new statement is compiled for each team, so each one is
		   finalized here; a statement left alive keeps sqlite3_close()
		   from succeeding. */
		sqlite3_finalize( statement );
		statement = NULL;
	}

CLOSE_DB :
	sqlite3_finalize( statement );	/* no-op when it is NULL */
	if ( fp != NULL ) {
		fclose( fp );
	}

	if ( sqlite3_close(conn) != SQLITE_OK ) {
		printf("Error closing the db: %s\n", sqlite3_errmsg(conn));
	}

}

/*
download_wiki builds, for every listed team, a directory named after the
team that contains:
  - the pages fetched by wget,
  - a text dump (links -dump) of every .html file, stored as <file>.dumpx,
  - the concatenation of the dumps, in a file named after the team,
  - <team>.xtr : the concatenation reduced to lowercase letters, digits and
    '-' with every other run of characters replaced by one space.
@team_list : path of the text file holding one team name per line
@year : "2007" selects the parts.mit.edu address, any other value is used
        as the host prefix of <year>.igem.org
The program is ended with exit(1) on most errors.

NOTE(review): team names and downloaded file names are inserted unquoted in
shell commands run by system(); a name holding shell metacharacters would
be interpreted by the shell.
*/
void download_wiki( char *team_list, char* year )
{
	FILE *fp_src, *fp_tar, *fp;
	int ch, last_ch;	/* int, so that EOF stays distinct from every byte */
	char buffer[MAXBUFFER], team_name[MAXBUFFER-LEN_CMD], digest_name[MAXBUFFER-LEN_CMD];
	int size_name, len;
	DIR *dp;
	struct dirent *entry;

	fp = fopen(team_list, "r");

	if ( fp == 0 ) {
		printf("Can't open %s file\n", team_list);
		exit(1);
	}

	/* The fgets() result drives the loop, so the last line is processed
	   even when it does not end with a newline. */
	while ( fgets( team_name, sizeof team_name, fp ) != NULL )
	{
		team_name[strcspn( team_name, "\r\n" )] = '\0';
		if ( team_name[0] == '\0' ) {
			continue;	/* blank line */
		}

		size_name = strlen( team_name );
		chgchar( team_name, '-', '_', size_name);

		if ( mkdir(team_name, 0777) == -1 ) {
			printf("error in the mkdir\n");
			exit(1);
		}

		if ( chdir(team_name) == -1 ) {
			printf("error changing the directory to %s\n", team_name);
			exit(1);
		}

		if ( strcmp( year, "2007") == 0 ) {
			len = snprintf(buffer, MAXBUFFER, "wget -R.jpg -R.png -R.gif \
				-R.jpeg -RUser:* -E -l1 -nd -r \
				http://parts.mit.edu/igem07/index.php/%s", team_name);
		} else {
			len = snprintf(buffer, MAXBUFFER, "wget -E -ITeam:%s -nd -r \
				http://%s.igem.org/Team:%s", team_name, year, team_name);
		}
		/* A truncated command must never reach the shell. */
		if ( len < 0 || len >= MAXBUFFER ) {
			printf("the wget command for %s is too long\n", team_name);
			exit(1);
		}
		printf("CMD : %s\n", buffer);
		system(buffer);

		if((dp = opendir(".")) == NULL) {
			printf("can't open the dir");
			fclose(fp);
			/* Go back to the starting directory: the caller keeps
			   running after this return. */
			if ( chdir("..") == -1 ) {
				printf("error changing the directory to %s\n", team_name);
				exit(1);
			}
			return;
		}

		while((entry = readdir(dp)) != NULL) {
			if ( isRep(entry->d_name) != IS_REP ) {
				if ( (strstr(entry->d_name, ".html") != NULL)
					&& (strstr(entry->d_name, ".dumpx") == NULL) ) {
				len = snprintf(buffer, MAXBUFFER, "links -dump %s > %s.dumpx",
					entry->d_name, entry->d_name);
				if ( len < 0 || len >= MAXBUFFER ) {
					printf("skip %s : name too long\n", entry->d_name);
					continue;
				}
				system(buffer);
				}
			}
		}
		closedir(dp);
		snprintf(buffer, MAXBUFFER, "cat *.dumpx > %s", team_name);
		system(buffer);

		len = snprintf( digest_name, sizeof digest_name, "%s.xtr", team_name );
		if ( len < 0 || (size_t)len >= sizeof digest_name ) {
			printf("team name %s is too long\n", team_name);
			exit(1);
		}

		fp_src = fopen(team_name, "r");
		if ( fp_src == 0 ) {
			printf("Can't open %s file\n", team_name);
			exit(1);
		}

		fp_tar = fopen(digest_name, "w");
		if ( fp_tar == 0 ) {
			printf("Can't open %s file\n", digest_name);
			exit(1);
		}

		/* Starting as if a space had just been written keeps separators
		   at the very beginning of the text out of the digest. */
		last_ch = 0x20;

		/* Stop on EOF itself: it is not a character of the text, so no
		   separator is written for it. */
		while ( (ch = fgetc(fp_src)) != EOF )
		{
			if ( (ch >= 0x41 && ch <= 0x5A) ||
			(ch >= 0x61 && ch <= 0x7A) ||
			(ch == 0x20 && last_ch != 0x20) ||
			ch == 0x2D ||
			(ch >= 0x30 && ch <= 0x39) ) {
				if (ch >= 0x41 && ch <= 0x5A) { //put in lowercase
					ch += 0x20;
				}
				fputc(ch, fp_tar);
			}
			else {
				/* every other byte counts as a separator; runs of
				   separators give one space only */
				ch = 0x20;

				if ( last_ch != 0x20 ) {
					fputc(ch, fp_tar);
				}
			}
			last_ch = ch;
		}

		fclose( fp_src );
		fclose( fp_tar );

		if ( chdir("..") == -1 ) {
			printf("error changing the directory to %s\n", team_name);
			exit(1);
		}
	}
	fclose(fp);
}

/*
compute_sip fills the `sip` column of a team table.
For each row of the team table:
  - column 2 of the row is read as the local frequency,
  - the same word is looked up in `dictionary` and column 2 of that row is
    read as the global frequency,
  - the row is updated with sip = local frequency / global frequency.
All the updates are sent between a "begin" and a "commit".
@conn : address of the connection handle
@team_name : name of the team table, inserted as is in the requests
The program is ended with exit(1) when a request cannot be compiled or a
value cannot be bound.
*/
void compute_sip( sqlite3 **conn, char *team_name )
{
	sqlite3_stmt *rows, *lookup, *store;
	char sql[MAXBUFFER];
	float team_freq, dict_freq;
	int rc;

	/* request 1 : walk through the team table */
	snprintf( sql, MAXBUFFER, "SELECT * from %s", team_name);
	if ( sqlite3_prepare_v2(*conn, sql, -1, &rows, NULL ) != SQLITE_OK ) {
		printf("compute_sip() : Error compiling the request 1: %s\n",
			sqlite3_errmsg(*conn) );
		close_sqlitedb( &conn );
		exit(1);
	}

	/* request 2 : find a word in the dictionary */
	if ( sqlite3_prepare_v2(*conn, "SELECT * from dictionary WHERE word=?", -1,
				&lookup, NULL ) != SQLITE_OK )
	{
		printf("compute_sip() : Error compiling the request 2: %s\n",
			sqlite3_errmsg(*conn) );
		close_sqlitedb( &conn );
		exit(1);
	}

	/* request 3 : store the ratio in the team table */
	snprintf( sql, MAXBUFFER, "UPDATE %s SET sip = ?/? WHERE word=?",
		team_name);
	if ( sqlite3_prepare_v2(*conn, sql, -1, &store, NULL ) != SQLITE_OK ) {
		printf("compute_sip() : Error compiling the request 3: %s\n",
			sqlite3_errmsg(*conn) );
		close_sqlitedb( &conn );
		exit(1);
	}

	sqlite3_exec(*conn, "begin", NULL, NULL, NULL );
	for ( ;; )
	{
		rc = sqlite3_step(rows);
		if ( rc != SQLITE_ROW ) {
			break;
		}

		team_freq = (float)sqlite3_column_double(rows, 2);

		if ( sqlite3_bind_text( lookup, 1, sqlite3_column_text(rows, 0), -1,
					SQLITE_STATIC) != SQLITE_OK ) {
			printf("compute_sip() : Error binding 1: %s\n", sqlite3_errmsg(*conn));
			close_sqlitedb( &conn );
			exit(1);
		}
		/* The step result is not checked before the column is read. */
		sqlite3_step( lookup );
		dict_freq = (float)sqlite3_column_double(lookup, 2);

		if ( sqlite3_bind_double( store, 1, (double)team_freq) != SQLITE_OK ) {
			printf("compute_sip() : Error binding 2: %s\n", sqlite3_errmsg(*conn));
			close_sqlitedb( &conn );
			exit(1);
		}

		if ( sqlite3_bind_double( store, 2, (double)dict_freq) != SQLITE_OK ) {
			printf("compute_sip() : Error binding 3: %s\n", sqlite3_errmsg(*conn));
			close_sqlitedb( &conn );
			exit(1);
		}

		if ( sqlite3_bind_text( store, 3, sqlite3_column_text(rows, 0), -1,
					SQLITE_STATIC) != SQLITE_OK ) {
			printf("compute_sip() : Error binding 4: %s\n", sqlite3_errmsg(*conn));
			close_sqlitedb( &conn );
			exit(1);
		}

		sqlite3_step( store );

		/* both helper requests are made ready for the next row */
		sqlite3_reset( lookup );
		sqlite3_reset( store );
	}
	sqlite3_exec(*conn, "commit", NULL, NULL, NULL );

	sqlite3_finalize( rows );
	sqlite3_finalize( lookup );
	sqlite3_finalize( store );
}

void make_sipword( char *team_list, char *database_name)
{
	FILE *fp_wiki, *fp;
	float nbr_of_words = 0;
	char word[MAXBUFFER], digest_name[MAXBUFFER-LEN_CMD], team_name[MAXBUFFER-LEN_CMD];
	char sqlite_query[MAXBUFFER];
	int size_name;

	sqlite3_stmt *stmt_select, *stmt_insert, *stmt_update;

	sqlite3 *conn;
	if ( sqlite3_open( database_name, &conn ) != SQLITE_OK )
	{
		printf("Error opening the db: %s\n", sqlite3_errmsg(conn));
		exit(1);
	}
	
	fp = fopen(team_list, "r");

	if ( fp == 0 ) {
		printf("Can't open %s file\n", team_list);
		exit(1);
	}

	while( !feof(fp) )
	{
		fgets( team_name, (MAXBUFFER - LEN_CMD), fp );

		if ( feof(fp) ) {
			break;
		}

		size_name = strlen( team_name );
		team_name[size_name - 1] = '\0';
		chgchar( team_name, '-', '_', size_name);

		printf("On the %s Team\n", team_name);

		// pre-compile the query to optimize the process
		snprintf( sqlite_query, MAXBUFFER, "SELECT * FROM %s WHERE word=?",
			team_name);

		if ( sqlite3_prepare_v2(conn, sqlite_query , -1,
			&stmt_select, NULL ) != SQLITE_OK )
		{
			printf("Error compiling the request: %s\n", sqlite3_errmsg(conn) );
			goto CLOSE_DB;
			exit(1);
		}

		snprintf( sqlite_query, MAXBUFFER, "INSERT INTO %s \
			VALUES (?, 1, NULL, NULL)", team_name);

		if ( sqlite3_prepare_v2(conn, sqlite_query, -1,
			&stmt_insert, NULL ) != SQLITE_OK )
		{
			printf("Error compiling the request: %s\n", sqlite3_errmsg(conn) );
			goto CLOSE_DB;
			exit(1);
		}
	
		snprintf( sqlite_query, MAXBUFFER, "UPDATE %s \
			SET global_occ = global_occ + 1 WHERE word=?", team_name);

		if ( sqlite3_prepare_v2(conn, sqlite_query, -1,
			&stmt_update, NULL ) != SQLITE_OK )
		{
			printf("Error compiling the request: %s\n", sqlite3_errmsg(conn) );
			goto CLOSE_DB;
			exit(1);
		}
	
		if ( chdir(team_name) == -1 ) {
			printf("error changing directory to %s\n", team_name);
			exit(1);
		}

		strncpy( digest_name, team_name, MAXBUFFER-LEN_CMD-4 );
		strcat( digest_name, ".xtr");

		fp_wiki = fopen( digest_name, "r" );
		if( fp_wiki == 0 ) {
			printf("error when you open %s file", digest_name);
			exit(1);
		}

		if ( chdir("..") == -1 ) {
			printf("error changing directory to ..\n");
			exit(1);
		}


		sqlite3_exec(conn, "begin", NULL, NULL, NULL );
		while ( !feof(fp_wiki) )
		{
			read_next_word( word, fp_wiki );
			if ( is_word_valid( word ) == FALSE ) {
				continue;
			}
			if ( is_word_exists( word, &stmt_select, &conn ) == FALSE ) {
				add_to_the_list( word, &stmt_insert, &conn );
			}
			else {
				inc_occ( word, &stmt_update, &conn );
			}
		
			nbr_of_words++;
		}
		sqlite3_exec(conn, "commit", NULL, NULL, NULL );
		fclose(fp_wiki);
		sqlite3_finalize( stmt_select );
		sqlite3_finalize( stmt_insert );
		sqlite3_finalize( stmt_update );

		compute_freq( &conn, nbr_of_words, team_name );
		compute_sip( &conn, team_name);
	}

CLOSE_DB :
	if ( sqlite3_close(conn) != SQLITE_OK ) {
		printf("Error closing the db: %s\n", sqlite3_errmsg(conn));
	}
}

/*
main runs the whole chain: optional download, creation of the tables,
dictionary, then SIP values.
argv[TEAM_NAME] : file holding the list of team names
argv[DATABASE_NAME] : sqlite3 database file
argv[YEAR] : optional; the download step is skipped when it is missing
return : 0 once every step has been called; the program exits with 1 when
         one of the two mandatory arguments is missing.
*/
int main( int argc, char *argv[] )
{
	char *team_list;
	char *database;
	char *year;

	/* argv is ended by a null pointer: the second test is only evaluated
	   when the first argument is present. */
	if ( !argv[TEAM_NAME] || !argv[DATABASE_NAME] ) {
		printf("usage : %s [list of team name] [name of database] [year]\n \
	- don't put [year] if you want to skip the download step.\n", argv[0] );
		exit(1);
	}

	team_list = argv[TEAM_NAME];
	database = argv[DATABASE_NAME];
	year = argv[YEAR];

	if ( year != 0 ) {
		printf("start to download wiki !\n");
		download_wiki( team_list, year );
	} else {
		printf("skip the downlad step !\n");
	}

	printf("make the database !\n");
	make_database( team_list, database );

	printf("start to make a dictionary !\n");
	make_dictionary( team_list, database );

	printf("start calculate SIP words !\n");
	make_sipword( team_list, database );

	return 0;
}