Can you improve the speed of my CSV Reader and Writer?

stef_fr · ‎01-13-2015

Hi,

I am glad to see that you have resolved all your issues.

Concerning the column-number limitation, maybe you can do the test in the while condition:

while(*carattere && !error && (stringa_in_corso<numero_stringhe)) ==> it will stop the parsing as soon as the column number is reached

Regards,

Stef

Labwindows/CVI user since version 4.0

holly7787 · ‎01-13-2015

i've done it this way because i had already tried it with the old version and there was this problem:

If I have a special column with CR LF inside and I don't parse it because I've already reached the maximum readable column, then when I parse the next line I get an error, because I'm still reading an "inquote" value.

Davide Vittorio G. - TLGB S.R.L.
Italian SW Developer

holly7787 · ‎01-15-2015

I've made some corrections to correctly handle a quoted ("inquote") column with CR LF inside it. This seems to be the final reader version.

commented there is the old version.

I have also written a column counter; it may be useful for allocating the destination strings.

/*
 * Parse one logical CSV record into caller-supplied field buffers.
 *
 * lines                 array of line strings, one per physical line. The
 *                       array length is not known here; if the array is
 *                       NULL-terminated, an unterminated quoted field at the
 *                       end of the data stops the parse cleanly.
 * riga_partenza         index of the line on which the record starts.
 * stringa_destinazione  numero_stringhe writable buffers that receive the
 *                       fields. Their capacity is not known here, so the
 *                       caller must size each one for the longest field.
 * numero_stringhe       number of buffers. Columns beyond this count are
 *                       still parsed (so that quoted line breaks in them are
 *                       consumed) but their content is dropped.
 * formato               1: ';' separates fields and every ',' is rewritten
 *                       to '.'; any other value is handled as 0: ','
 *                       separates fields and ';' is an ordinary character.
 *
 * A field whose first character is '"' is a quoted field: inside it the
 * separator is literal, "" stands for one '"', and an end of line is stored
 * as "\r\n" while parsing continues on the next entry of lines.
 *
 * Returns the index of the last line consumed by the record. Malformed
 * quoting stops the parse; no separate error code is reported.
 */
int leggi_riga_csv_v2(char **lines, int riga_partenza, char *stringa_destinazione[], int numero_stringhe, int formato)
{
	// Select the separator without indexing a table, so an unexpected
	// formato value cannot read out of bounds.
	const char delim = (formato == 1) ? ';' : ',';

	int stringa_in_corso = 0;        // index of the field being filled
	int index_stringa_in_corso = 0;  // write offset inside that field

	int inquote = 0;
	int i = 0;

	int error = 0;                   // set to leave the loop (end of record or malformed input)
	char *carattere = NULL;

	for(i = 0; i < numero_stringhe; i++){
		stringa_destinazione[i][0] = 0;
	}

	// Point to beginning of current line
	carattere = lines[riga_partenza];

	while(!error)
	{
		switch(*carattere){
			case '\"':
				if(!inquote && (index_stringa_in_corso == 0)){
					// a quote as first character of a field opens a quoted field
					inquote = 1;
					carattere++;
				}
				else if(inquote){
					// look at the character that follows the quote
					carattere++;
					if(*carattere == '\"'){
						// doubled quote: store one literal quote
						if(stringa_in_corso < numero_stringhe){
							stringa_destinazione[stringa_in_corso][index_stringa_in_corso++] = *carattere;
						}
						carattere++;
					}
					else if((*carattere == 0) || (*carattere == '\r') || (*carattere == '\n')){
						// closing quote at end of line: end of record
						if(stringa_in_corso < numero_stringhe){
							stringa_destinazione[stringa_in_corso][index_stringa_in_corso] = 0;
						}
						error = 1;
					}
					else if(*carattere == delim){
						// closing quote followed by separator: end of field
						if(stringa_in_corso < numero_stringhe){
							stringa_destinazione[stringa_in_corso][index_stringa_in_corso] = 0;
						}
						stringa_in_corso++;
						index_stringa_in_corso = 0;
						inquote = 0; // the next field starts unquoted
						carattere++;
					}
					else{
						// closing quote not followed by separator or end of line
						if(stringa_in_corso < numero_stringhe){
							stringa_destinazione[stringa_in_corso][index_stringa_in_corso] = 0;
						}
						error = 1;
					}
				}
				else{
					// quote in the middle of an unquoted field
					if(stringa_in_corso < numero_stringhe){
						stringa_destinazione[stringa_in_corso][index_stringa_in_corso] = 0;
					}
					error = 1;
				}
			break;

			case ',':
			case ';':
				if((*carattere == delim) && !inquote){
					// separator outside quotes: close the field, start the next one
					if(stringa_in_corso < numero_stringhe){
						stringa_destinazione[stringa_in_corso][index_stringa_in_corso] = 0;
					}
					stringa_in_corso++;
					index_stringa_in_corso = 0;
				}
				else if(stringa_in_corso < numero_stringhe){
					// ordinary character; in format 1 a ',' is replaced by '.'
					stringa_destinazione[stringa_in_corso][index_stringa_in_corso++] =
						((*carattere == ',') && (formato == 1)) ? '.' : *carattere;
				}
				carattere++;
			break;

			case '\0': // terminator
			case '\r': // CR = 0x0D = 13
			case '\n': // LF = 0x0A = 10
				if(inquote){
					// line break inside a quoted field: store CR LF and continue on the next line
					if(stringa_in_corso < numero_stringhe){
						stringa_destinazione[stringa_in_corso][index_stringa_in_corso++] = '\r';
						stringa_destinazione[stringa_in_corso][index_stringa_in_corso++] = '\n';
					}

					carattere = lines[++riga_partenza];
					if(carattere == NULL){
						// ran out of lines inside a quoted field (only detectable
						// when lines is NULL-terminated): stop on the last real line
						riga_partenza--;
						if(stringa_in_corso < numero_stringhe){
							stringa_destinazione[stringa_in_corso][index_stringa_in_corso] = 0;
						}
						error = 1;
					}
				}
				else{
					// end of record
					if(stringa_in_corso < numero_stringhe){
						stringa_destinazione[stringa_in_corso][index_stringa_in_corso] = 0;
					}
					error = 1;
				}
			break;

			default:
				// copy any other character to the current field
				if(stringa_in_corso < numero_stringhe){
					stringa_destinazione[stringa_in_corso][index_stringa_in_corso++] = *carattere;
				}
				carattere++;
			break;
		}
	}

	return riga_partenza;
}

/*
 * Count the columns of the first CSV record in lines.
 *
 * lines    array of line strings, one per physical line; counting starts at
 *          lines[0]. The array length is not known here; if the array is
 *          NULL-terminated, an unterminated quoted field at the end of the
 *          data stops the count cleanly.
 * formato  1: ';' separates fields; any other value is handled as 0:
 *          ',' separates fields.
 *
 * A field whose first character is '"' is a quoted field: separators inside
 * it are not counted, "" stands for one '"', and an end of line inside it
 * continues the record on the next entry of lines.
 *
 * Returns the number of columns. On malformed quoting the count stops and
 * only the columns completed so far are returned.
 */
int contacolonne_csv(char **lines, int formato)
{
	// Select the separator without indexing a table, so an unexpected
	// formato value cannot read out of bounds.
	const char delim = (formato == 1) ? ';' : ',';

	int numero_colonne = 0;
	int index_stringa_in_corso = 0; // characters seen in the current field; 0 means "at field start"
	int riga_partenza = 0;

	int inquote = 0;

	int error = 0;                  // set to leave the loop (end of record or malformed input)
	char *carattere = NULL;

	// Point to beginning of current line
	carattere = lines[riga_partenza];

	while(!error)
	{
		switch(*carattere){
			case '\"':
				if(!inquote && (index_stringa_in_corso == 0)){
					// a quote as first character of a field opens a quoted field
					inquote = 1;
					carattere++;
				}
				else if(inquote){
					// look at the character that follows the quote
					carattere++;
					if(*carattere == '\"'){
						// doubled quote: one literal quote
						index_stringa_in_corso++;
						carattere++;
					}
					else if((*carattere == 0) || (*carattere == '\r') || (*carattere == '\n')){
						// closing quote at end of line: last column of the record
						numero_colonne++;
						error = 1;
					}
					else if(*carattere == delim){
						// closing quote followed by separator: end of field
						carattere++;
						numero_colonne++;
						index_stringa_in_corso = 0;
						inquote = 0; // the next field starts unquoted
					}
					else{
						error = 1; // closing quote not followed by separator or end of line
					}
				}
				else{
					error = 1; // quote in the middle of an unquoted field
				}
			break;

			case ',':
			case ';':
				if((*carattere == delim) && !inquote){
					// separator outside quotes: one more completed column
					numero_colonne++;
					index_stringa_in_corso = 0;
				}
				else{
					// ordinary character of the current field
					index_stringa_in_corso++;
				}
				carattere++;
			break;

			case '\0': // terminator
			case '\r': // CR = 0x0D = 13
			case '\n': // LF = 0x0A = 10
				if(inquote){
					// line break inside a quoted field: it counts as two
					// characters (CR LF) and the record continues on the next line
					index_stringa_in_corso += 2;
					carattere = lines[++riga_partenza];
					if(carattere == NULL){
						// ran out of lines inside a quoted field (only detectable
						// when lines is NULL-terminated): count the open field and stop
						numero_colonne++;
						error = 1;
					}
				}
				else{
					// end of record: count the last column
					numero_colonne++;
					error = 1;
				}
			break;

			default:
				// ordinary character of the current field
				index_stringa_in_corso++;
				carattere++;
			break;
		}
	}

	return numero_colonne;
}

Davide Vittorio G. - TLGB S.R.L.
Italian SW Developer

stef_fr · ‎01-15-2015

Hi holly7787,

The best could be to allocate destination strings when parsing a line.

Each time you encounter a new separator you allocate a new string ... then your function can return allocated strings and the number of columns found.

Next part of your code will process strings and free allocated strings.

This avoids walking through the whole file twice (once to count columns and once to parse lines).

As a last optimization, line splitting is also not required: you can just put the content of the whole file in a buffer and pass the buffer pointer and the last character offset to your function.

Something like this :

int leggi_riga_csv_v3(char *buffer, int *offset ...

*carattere will become buffer[*offset]

carattere++ will become (*offset)++ (offset is a pointer, so the value it points to is what must be incremented)

Best regards,

Stef

Labwindows/CVI user since version 4.0

LabWindows/CVI

Can you improve the speed of my CSV Reader and Writer?

Rif.: Can you improve the speed of my CSV Reader and Writer?

Rif.: Can you improve the speed of my CSV Reader and Writer?

Rif.: Can you improve the speed of my CSV Reader and Writer?

Rif.: Can you improve the speed of my CSV Reader and Writer?