4

This is my C code that reads a list of URLs from a file, and tries to separate the various parts of the URL. This is just rough parsing, I'm not bothered about special cases. I guess there is some fault with the sscanf() statement; when I run this, I get "segmentation FAULT". And moreover, the full url is being assigned to the "proto" string.

#include<stdio.h>
#include<string.h>
#include<stdlib.h>

# define BAG_SIZE 14

char bag[117][30];

void initbag()
{
strcpy(bag[0],"account");
strcpy(bag[1],"audit");
strcpy(bag[2],"ad");
strcpy(bag[3],"advertising");
strcpy(bag[4],"marketing");
strcpy(bag[5],"application");
strcpy(bag[6],"banking");
strcpy(bag[7],"barter");
strcpy(bag[8],"business");
strcpy(bag[9],"econo");
strcpy(bag[10],"commerce");
strcpy(bag[11],"communication");
strcpy(bag[12],"computer");
strcpy(bag[13],"processing");
}
/*
 other bag[] values will be later copied
*/

void substr(char dest[10],char src[200],int start,int len)
{
int i,j;

for(i=start,j=0;i<start+len;i++,j++)
dest[j]=src[i];
dest[j]='\0';

}

int found(char* word)
{
   int i;
   for(i=0;i<BAG_SIZE;i++)
   if((!strcmp(word,bag[i]))||(strstr(bag[i],word)!=NULL)) return 1;
   return 0;
}

void main()
{
int i,j,k;

char buff[10],fullurl[100];
char proto[5],www[4],host[100],tokens[200],tld[4];
float feature[11];for(i=0;i<11;i++) feature[i]=0;
FILE *furl,*fop;
furl=fopen("bizurls.txt","r");
fop=fopen("urlsvm.txt","w");
initbag();
printf("\nbag initialised");fflush(stdout);

while(!feof(furl))
{
   fscanf(furl,"%s",fullurl);
   printf("%s",fullurl);
   sscanf(fullurl,"%s://%s.%s.%s/%s\n",proto,www,host,tld,tokens);// this line isnt working properly
   printf("2hi");fflush(stdout);
   printf("proto : %s\nwww:%s\nhost :%s\ntld:%s\ntokens:%s\n",proto,www,host,tld,tokens);fflush(stdout);


   for( i=4;i<=8;i++)
   {
       for(j=0;j<strlen(host)-i+1;j++)
           {
                substr(buff,host,j,i);
                if(found(buff)) feature[i-3]++;

           }
   }
  if((!strcmp(tld,"biz"))||(!strcmp(tld,"org"))||(!strcmp(tld,"com"))||(!strcmp(tld,"jobs")))   
        feature[0]=1;
  else if((!strcmp(tld,"info"))||(!strcmp(tld,"coop"))||(!strcmp(tld,"net")))
        feature[0]=0.5;
  else
    feature[0]=0;


   for( i=4;i<=8;i++)
   {
       for(j=0;j<strlen(tokens)-i+1;j++)
           {
                substr(buff,tokens,j,i);
                if(found(buff)) feature[i+2]++;

           }
   }

/*.biz · .com · .info · .name · .net · .org · .pro
.aero, .coop, .jobs, .travel */

for(i=0;i<11;i++) fprintf(fop," %d:%f",i,feature[i]);
fprintf(fop,"\n");


}
fflush(fop);
fclose(furl);
fclose(fop);
}
5
  • 1
    Crikey, people have a hard-enough time doing this with a full-powered regex engine. I suspect it will be very difficult with the scanf format string :-) Commented Jan 25, 2010 at 6:24
  • 1
    You should always check the return value of *scanf functions. As KennyTM mentioned, sscanf is the wrong tool for URL scanning. Commented Jan 25, 2010 at 6:25
  • 1
    Can you post more code? How large are your strings? Which instruction segfaults? Commented Jan 25, 2010 at 6:26
  • i've edited to post the whole code.. Commented Jan 25, 2010 at 6:40
  • 1
    Ok, this shows that the segfault is due to proto being only 5 characters big but you are copying a much larger text into it -> buffer overflow. That explains your crash. Otherwise I second to not use sscanf for URL scanning (parsing). Commented Jan 25, 2010 at 6:45

3 Answers 3

3

Many answers here:
Best ways of parsing a URL using C?

Sign up to request clarification or add additional context in comments.

Comments

3

%s in sscanf will only stop when it hits the first white-space character, the end of the string or the specified maximum length. Seeing as an URL has no whitespace, that's the reason why proto becomes fullurl.

For the segmentation fault: as proto can only hold 5 bytes (including the trailing null, so only 4 bytes of data which would not cover e.g. https), putting the full URL into it will cause a buffer overflow / segmentation fault. sscanf is rather problematic in this regards. Documentation requests that each char buffer receiving a %s should be big enough to hold the full string (plus \0).

Comments

1

It won't work because proto would match the whole fullurl and the rest will be unmatched. You should use a proper URL parsing function or regex for this.

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.