Beginners programming challenge #24

    Re: Beginners programming challenge #24


    Edit 9: Changed a lot of code to get rid of globals, and added -l to show sentence lengths and the average sentence length.
    Edit 8: Fixed all memory leaks.

    /*
    * Name        : 24challange.cpp
    * Author      : Highspider
    * Version     : 9 - sentences length
    * Description: Write a program that analyzes input from a file, essay.txt, and
    *                   compiles statistics on it. The program should output:
    *                    1. The total word count
    *                    2. The count of unique words
    *                    3. The number of sentences
    * Changes  :        Added Display mode -l
    *                   Removed -w mode (strange debug mode; had trouble with
    *                     step-into/step-out debugging)
    *                   Got rid of all non-const global vars
    *                   Made funcs cleaner -- had to without globals
    *                   Fixed all memory leaks
    *                   Added two more dynamic arrays for similar words and their counts
    *                   Added a bubble sort for similar-words order
    *                   Added Display Mode -s.
    *                   Added Display Mode -p
    *                   Use command line args for cookie points.
    *                   Added Display Mode -d and -n
    */
    #include <cstdio>       //for snprintf
    #include <cstring>      //for strcmp strlen strcpy strcmp
    #include <fstream>      //for files
    #include <iostream>     //for Std in out
    #include <stdlib.h>     //for atoi
    using std::cout;        //iostream: std out
    using std::endl;        //iostream: end line + return
    using std::ifstream;    //fstream:  open read only mode
    //const globals
    const int MAX = 256;        //size of the fixed char buffers (buffer[MAX]) used below
    //function declarations; the definitions follow main
    //GetWords: fills words and sent from wordfile, returns the sentence count
    int GetWords(char buffer[MAX], char **&words, ifstream &wordfile, int &num_words,
                 char **&sent, int&num_sent);
    //SimWords: fills simwords/simcount from words, returns the count of similar words
    int SimWords(char ** words, int num_words, char **&simwords, int &num_simwords,
                 char **&simcount, int &num_simcount);
    //Phrases: fills phrases with repeated three-word phrases found in words
    void Phrases(char buffer[MAX], char ** words, char **&phrases, int &num_phrases,
                 int num_words);
    //avgslen: prints each sentence with its length, then the average length
    void avgslen(char ** sent, int num_sent, char ** words, int num_words);
    //DisplayWords: prints the words; mode selects the formatted or plain layout
    void DisplayWords(char ** words, int num_words, bool mode);
    //DisplaySimWords: prints each similar word next to its count
    void DisplaySimWords(char ** simwords,char ** simwordscount, int num_simwords);
    //sort: reorders both arrays together by the numeric value of simwordscount
    void sort(char **&simwordscount, char **&simwords, int num_simwordscount );
    //DynamicAllocate: appends a copy of buffer to the words array
    void DynamicAllocate(char buffer[MAX], char **&temp, char **&words, int & num_words);
    //Deallocate: deletes every string held in words (not the array itself)
    void Deallocate(char **&words,  int num_words);
    //main with commandline args
    int main (int argc, char* argv[])
        char buffer[MAX];           //buffer place holder used for getline() and strcmp()
        int sentence  = 0;          //count of sentences
        int similar = 0;            //count of found similar words
        //Pointer of pointers and there count for dynamic allocating
        int num_words = 0;          //count of words listed in the array words
        char ** words = 0;          //pointer of pointers for the word list array
        int num_sent = 0;           //count number of sentences
        char ** sent = 0;           //pointer of pointers for sentences counts
        int num_simwords = 0;       //count of simwords listed in the array
        char ** simwords = 0;       //pointer of pointer for similar words
        int num_simcount = 0;       //count of counters put on similar words
        char ** simcount = 0;       //pointer of pointer for the count of similar words
        int num_phrases = 0;        //count of words listed in the array phrasewords
        char ** phrases = 0;        //pointer of pointers for the phrase list array
        //check for command line args
        if ( argc > 1 && argc < 4 )
            //open the word list
            ifstream wordfile;
            //wordfile is [1] from command line
   ( argv[1] );
            //if file is found and opened
            if (wordfile.is_open() ){
                //call func getwords and sentence count
                sentence = GetWords(buffer, words, wordfile, num_words, sent, num_sent);
                //close the word file
                //call func get similar words and the count of them
                similar = SimWords(words, num_words, simwords, num_simwords,
                                    simcount, num_simcount);
                //set bool Display modes  for DisplayWords from argv[2]
                if ( argc == 3){
                    if(!strcmp( *(argv + 2 ), "-d" ))
                       DisplayWords(words, num_words, true);
                    else if(!strcmp( *(argv + 2 ), "-n" ))
                      DisplayWords(words, num_words, false);
                    else if(!strcmp( *(argv + 2 ), "-s" )){
                        //sort list before diplaying it
                        sort(simcount, simwords, num_simcount);
                        //display list and have x for analyze
                        DisplaySimWords(simwords, simcount, num_simwords);
                    else if(!strcmp( *(argv + 2 ), "-p" )){
                        Phrases(buffer, words, phrases, num_phrases, num_words);
                        //Display the phrases
                        DisplayWords(phrases, num_phrases, true);
                        //Deallocate memory for phrases
                        Deallocate(phrases, num_phrases);
                        delete [] phrases;
                    //-l avgerage sentences length and their words
                    else if(!strcmp( *(argv + 2 ), "-l" ))
                        avgslen(sent, num_sent, words, num_words);
                 }//end if command line arguments
                //Display the words
                //display header for analyze first
                cout << "analyze\n"
                 << "Total word count: "<< num_words << endl
                 << "Unique words: " << num_words - similar << endl
                 << "Sentences: " << sentence << endl;
                //Deallocate memory for words array
                Deallocate(words, num_words);
                delete [] words;
                //Deallocate memory for similar words array
                Deallocate(simwords, num_simwords);
                delete [] simwords;
                //Deallocate memory for similar words count numbers array
                Deallocate(simcount, num_simcount);
                delete [] simcount;
                //Deallocate memory for sentences length count
                Deallocate(sent, num_sent);
                delete [] sent;
            //if their is an error opening the file
                cout << "\n!Error!\n!Error!\t" << argv[1]
                     << "\n!Error!\tFile not found or unable to open\n!"
                     << "Error!\n" << endl;;
                //Display Usage if no command line args or to many args
                cout << "\n\nUsage: " << argv[0] << " <filename> [ arg ]\n"
                << "\t\t -d:  (d)isplay words of the file\n"
                << "\t\t -n:  display words (n)ot formated\n"
                << "\t\t -s:  display (s)imilar words\n"
                << "\t\t -p:  display similar (p)hrases\n"
                << "\t\t -l:  display average (l)ength of sentences\n"
                << endl;
    }//end main
    *NAME: GetWords()
    *PURPOSE: Get words out of a text file and build array for words
    *PARAM: char **&words, ifstream the text file, num_words
    *        all by ref, bool watch, buffer by value
    *Return: int total number of sentences found
    int GetWords(char buffer[MAX], char **&words, ifstream &wordfile,
            int &num_words, char **&sent, int&num_sent)
        char ** temp  = 0;       //temp for 3-way swap dynamic allocation
        int buflen = 0;          //temp for buffer len
        int sentences = 0;       //return for sentences
        char tempbuff[MAX];
            //read from text file and use ' 'a blank space for delimator
            wordfile.getline(buffer,256,' ');
            //how many chars are in the buffer[]
            buflen = strlen(buffer);
            //loop for each char in buffer[]
            for (int i = 0; i < buflen; ++i){
                //change to upper case to lower case
                if (buffer[i] >= 65 && buffer[i] < 90)
                    buffer[i] = buffer[i] + 32;
                //get rid of commas,
                else  if ( buffer[i] == 44 )
                       buffer[i] = '\0';
                //if start "quote
                else if (buffer[i] == 34 && buflen > i + 1){
                    //push bach char the start quote ["quote/0] to be [quote/0]
                    for (int s = 0; s < buflen; ++s)
                        buffer[s] = buffer[s + 1];
                        //bufer len change
                        buflen = strlen(buffer);
                        //set back the main loop to reloop;
                        i = -1;
                //count sentence and add terminator char in place
                //46 = . 33 = ! 63 = ?
                else if ( buffer[i] == 46 || buffer[i] == 33 || buffer[i] == 63 ){
                     //copy the count of words to a temp buffer
                     sprintf(tempbuff, "%d", num_words);
                     //build array of these counts to use for sentence lengths
                     DynamicAllocate(tempbuff, temp, sent, num_sent);
                     buffer[i] = '\0';
                //NOT 13 a carriage return its 10 newline
                else if (buffer[i] == 10){
                    buffer[i] = '\0';
                    //build the extra array
                    DynamicAllocate(buffer,temp, words, num_words);
                    //there may be one two or more newlines
                    do {
                    } while ( buffer[i] == 10 );
                    //copy the new word over
                    for (int s = 0; s < buflen; ++s)
                        buffer[s] = buffer[i + s];
                        buflen = strlen(buffer);
                    //Reloop the main loop to check the new word
                    i = -1;
            }//end for loop
            //build arrays
            DynamicAllocate(buffer,temp, words, num_words);
        }while ( !wordfile.eof()); //end of do while loop end of file
        return sentences;
    *NAME: SimWords()
    *PURPOSE: Check for similar words
    *PARAM: char ** words, int num_words by value. char simwords,
    *     : char num_simwords, char simcount, int num_simcount ref
    *RETURN: int total number of simwords found
    int SimWords(char ** words, int num_words, char **&simwords, int &num_simwords,
            char **&simcount, int &num_simcount){
       char ** temp  = 0;     //temp for dynamic allocate
       int number = 0;        //temp for converting string to int back to string
       int count = 0;         //total count of similar words
       bool addnew = true;    //bool true/false flag for adding new entries
       char GE[] = "2     ";  /* OVERSIZED array default number used simcount
                              *  if not oversized program leaks memory when changing
                              *  values of char** simcount
       //loop m main for each word in the list
       for (int m = 0; m < num_words; m++ ){
           //loop s sub to compare with words left in the list
           for (int s = m + 1; s < num_words; s++){
             //if they are the same word
             if (!strcmp(words[m], words[s] )){
                //add a new entry (might change later)
                addnew = true;
                //add one to similar words found
                //check if allready in the list
                for (int ct = 0; ct < num_simwords; ct++){
                   //if same word already in simwords list
                   if(!strcmp( words[m], simwords[ct])) {
                      //update that entries count
                      //cast array to int and add one to the count
                      number = atoi( simcount[ct] ) + 1;
                      * sprintf() is used as itoa
                      * Strange Linux version of itoa
                      * itoa is windows proprietary
                      * It took a long to figure this out
                      *///update the entry with new count
                      sprintf(simcount[ct], "%d", number);
                     //set the bool flag that entry has been edited
                     addnew = false;
                     //break this CT Count Twice loop
                     ct = num_words;
                //if no odd entry was updated bool flag is true
                //and we are adding new entry
                if (addnew){
                   DynamicAllocate(words[m], temp, simwords, num_simwords);
                   DynamicAllocate(GE, temp, simcount, num_simcount);
                 //break the compare words left list loop
                //to prevent over reads
                s = num_words;
             }//end if same word
          }//end for() s
       }//end for() m
       return count;
    }//end func
    *NAME phrases()
    *PURPOSE: Find similar phrases in a list of words
    *PARAM:    char buffer, char words, int number by value, char phrase
    *     :   int num_phrase by ref
    void Phrases(char buffer[MAX], char **words, char **&phrases, 
                         int &num_phrases, int num_words){
        char buffer2[MAX];        //temp buffer for compareing words
        char ** temp;             //temp need for allocate 3-way swap
        bool addnewflag = false;  //add a new entry true or false
        cout << "phrases" << endl;
        for ( int i = 0; i < num_words - 2; ++i ){
            addnewflag = true;  // t/f for add new entry to found array
            //build a buffer of 3 words
            strcpy(buffer, words[i]);
            strcat(buffer," "); //add spaces for display reasons
            strcat(buffer, words[i + 1]);
            strcat(buffer," ");
            strcat(buffer, words[i + 2]);
            //loop a buffer of the next 3 words
            for ( int l = i + 2; l < num_words - 2; ++l ){
                //build buffer3 of the next 3 words
                strcpy(buffer2, words[l]);
                strcat(buffer2," ");
                strcat(buffer2, words[l + 1]);
                strcat(buffer2," ");
                strcat(buffer2, words[l + 2]);
                //if match of the 3 words
                if(!strcmp( buffer, buffer2 )){
                    //loop the array of phrases already found
                    for (int f = 0; f < num_phrases; f++){
                    //if phrase is already in - don't add it again
                        if(!strcmp( buffer, *(phrases + f) )){
                            addnewflag = false;
                    //if not updated entry flag is true add it
                    if (addnewflag){
                        DynamicAllocate(buffer, temp, phrases, num_phrases);
            }//end l loop
        }//end i loop
    }//end func
    /*
    *NAME: avgslen();  average sentence length
    *PURPOSE: print every sentence with its length in words, then the average
    *PARAM: sent       one decimal string per sentence: index in words of the
    *                  last word of that sentence
    *       num_sent   number of entries in sent
    *       words      the word list
    *       num_words  number of entries in words; words are never read at or
    *                  beyond this index
    *RETURN: none (prints nothing when there are no sentences)
    */
    void avgslen(char ** sent, int num_sent, char ** words, int num_words){
        if ( num_sent <= 0 )
            return;
        float total = 0;     //sum of all sentence lengths
        int previous = -1;   //index of the last word of the sentence before
        std::cout << "\n\n";
        for (int i = 0; i < num_sent; i++){
            const int last = atoi(sent[i]);
            const int length = last - previous;
            //the counter x) and the length [n]
            std::cout << i + 1 << ") [" << length << ']';
            //the words of this sentence, kept inside the word list
            int first = previous + 1;
            if (first < 0)
                first = 0;
            for (int l = first; l <= last && l < num_words; l++)
                std::cout << ' ' << words[l];
            std::cout << '\n';
            total = total + length;
            previous = last;
        }
        //divide to find average and display the results
        std::cout << "\nThe average sentence length is: ["
                  << total / num_sent << "] words long\n\n";
    }
    /*
    *NAME: DisplayWords()
    *PURPOSE: print the words to standard output
    *PARAM: words      the strings to print
    *       num_words  number of entries in words
    *       mode       true:  one numbered line per word, tab n[word]
    *                  false: the words one after the other with nothing between
    *NOTE: a closing empty line is printed in both modes
    */
    void DisplayWords(char ** words, int num_words, bool mode){
        if (!mode){
            //not formatted display
            for ( int i = 0; i < num_words; ++i )
                std::cout << words[i];
        }
        else {
            //formatted display
            for ( int i = 0; i < num_words; ++i )
                std::cout << '\t' << i + 1 << '[' << words[i] << ']' << '\n';
        }
        std::cout << std::endl;
    }
    /*
    *NAME: DisplaySimWords()
    *PURPOSE: print the similar words and their counts to standard output
    *PARAM: simwords       the repeated words
    *       simwordscount  the count of each word as text, same order as simwords
    *       num_simwords   number of entries in both arrays
    *NOTE: the header says "descending", the order printed is the order of the
    *      arrays as they are passed in.
    */
    void DisplaySimWords(char ** simwords, char ** simwordscount, int num_simwords){
        //Display header
        std::cout << "\n\n\t\tDescending frequency most used words:\n\n";
        //one line per entry:  n  [word] * count
        for ( int i = 0; i < num_simwords; ++i ){
            std::cout << i + 1 << "  [" << simwords[i] << "] * "
                      << simwordscount[i] << '\n';
        }
        std::cout.flush();
    }
    /*
    *NAME sort(char **&simwordscount, char **&simwords, int num_simwordscount )
    *PURPOSE: Reorder the simwords arrays by frequency, highest count first
    *PARAM:   simwordscount      counts as decimal text, by ref
    *         simwords           the words belonging to the counts, by ref
    *         num_simwordscount  number of entries in both arrays, by value
    *NOTE:    Bubble sort algorithm. Both arrays are swapped together so word
    *         and count stay in the same cell; entries with the same count keep
    *         their order. Stops as soon as a pass makes no swap.
    */
    void sort(char **&simwordscount, char **&simwords, int num_simwordscount )
    {
        bool swapped = true;
        for ( int pass = 0; swapped && pass < num_simwordscount - 1; pass++ ){
            swapped = false;
            //after each pass the smallest remaining count sits at the end
            for ( int i = 0; i < num_simwordscount - 1 - pass; i++ ){
                if ( atoi(simwordscount[i]) < atoi(simwordscount[i + 1]) ){
                    //3-way swap of the counts
                    char * temp = simwordscount[i];
                    simwordscount[i] = simwordscount[i + 1];
                    simwordscount[i + 1] = temp;
                    //3-way swap of the matching words
                    temp = simwords[i];
                    simwords[i] = simwords[i + 1];
                    simwords[i + 1] = temp;
                    swapped = true;
                }
            }
        }
    }
    *NAME: DynamicAllocate()
    *PURPOSE: Build arrays dynamically
    *PARAM: char buffer by value, char temp, char words,num_words by ref
    void DynamicAllocate(char buffer[MAX], char **&temp, char **&words,
            int &num_words){
        //pointer of pointers    Notice NEW char * []
        temp = new char * [num_words + 1];    //temp for word array
        //copy all previously entered words to our temp array
        //note first time loop doesn't runs i=0 < num_words=0
        //"(0<0)not true"
        for ( int i = 0; i < num_words; i++){
            temp[i] = words[i];
        //make new temp array the size of the string
        temp[num_words] = new char[strlen( buffer ) + 1];
        //copy buffer to temp
        strcpy( temp[num_words], buffer );
        //deallocate old words
        delete [] words;
        //copy address of temp to words
        words = temp;
        //add one to number entries 
    /*
    *NAME: Deallocate()
    *PURPOSE: Delete / de-allocate the strings held in an array of words
    *PARAM: words      the array, by ref; may be 0, then nothing is done
    *       num_words  number of entries in words, by value
    *NOTE:  the pointer array itself is NOT deleted, the caller does delete [].
    *       Every freed cell is set to 0, so a second call is harmless.
    */
    void Deallocate(char **&words, int num_words){
        if ( words == 0 )
            return;
        //delete | de-allocate each string of the two star char **
        for ( int i = 0; i < num_words; ++i ){
            delete [] words[i];
            words[i] = 0;
        }
    }
    Usage: ./24challange <filename> [ arg ]
             -d:  (d)isplay words of the file
             -n:  display words (n)ot formated
             -s:  display (s)imilar words
             -p:  display similar (p)hrases
             -l:  display average (l)ength of sentences

    ./24challenge /home/username/Desktop/essay.txt

    Total word count: 468
    Unique words: 223
    Sentences: 38
    ./24challenge /home/username/Desktop/essay.txt -s
    Descending frequency most used words:
    1  [cats] * 25
    2  [the] * 20
    3  [a] * 15
    Total word count: 468
    Unique words: 223
    Sentences: 38

    ./24challenge /home/username/Desktop/essay.txt -p

        1[a cat is]
        2[are civilized members]
        3[civilized members of]
    ./24challenge /home/username/Desktop/essay.txt -d
    ./24challenge /home/username/Desktop/essay.txt -l
    1) [6] a dog is man's best friend
    2) [19] that common saying ... <skip>
    3) [9] for many people a cat is their best friend
    38) [8] in many ways cats are the ideal housepet
    The average sentence length is: [12.3158] words long
    Last edited by highspider; January 6th, 2012 at 08:32 AM.
    In a perfect world my college professors would allow assignment as .odt files! And code as Eclipse projects.

    Re: Beginners programming challenge #24

    This is my entry in D. I tried to make the code as easy to read as possible.

    It only finds phrases that are 3 words long, the rest is OK. It accepts the filename as a first argument.

    If you have 'dmd' installed then just use the command below to run the program.

    ./program.d file.txt

    PHP Code:
    #!/usr/bin/rdmd -w
    import std.stdio;
    import std.array;
    import std.regex;
    import std.string;
    import std.algorithm;
    import std.conv;
    import std.file;
    pragma(msg"Compiling, please wait...");

    //returns array of unique words
    //NOTE(review): this listing seems to have lost its braces, loop headers and
    //some operators when it was posted (e.g. "int count 0;", "icmp(s2s)"); they
    //must be restored from the author's original before this compiles.
    //The names s and s2 used below are not declared in the visible text --
    //presumably loop variables of the lost loop headers; TODO confirm.
    string[] Unique(string[] arr)
    int count 0;
    string[] unique;

    count 0;
    //compare all entries
    if(icmp(s2s) == 0count += 1;
    //count equals 0 if the word does not yet exist in "unique"
    if(count == 0)
    unique ~= s;

    //writes a list of commonly used words
    //"amount" limits the output of function
    //NOTE(review): operators, commas and loop headers are missing from this
    //listing (e.g. the parameter list below has no commas); restore before use.
    void List(string[] arrstring[] uniqueint amount)
    int count 0;
    string[] hits_s;
    count 0;
    //count how many times a word repeats
    if(icmp(s2s) == 0count += 1;
    //add words to array
    //entries are text of the form "<count> - <word>"; one branch puts a "0" in
    //front -- presumably to pad one-digit counts so the string sort below orders
    //them like numbers; TODO confirm against the original
    if(count 10hits_s ~= ("0" text(count) ~ " - " s);
    hits_s ~= (text(count) ~ " - " s);
    //sorts the strings with the predicate "a > b"
    sort!("a > b")(hits_s);
    int i 0hits_s.lengthi++)
    == amount) break;

    //collects phrases of 3 consecutive words from arr and writes them out
    //NOTE(review): the if/foreach keywords, braces and several operators are
    //missing from this listing; the conditions below are fragments only.
    void Phrases(string[] arr)
    string[] phrases;
    string tmp;
    string tmp2;
    int count 0;
    int length arr.length;
    bool exists false;
    //save phrases of 3 words into array
    foreach(int istring sarr)
    length-2//stop before reaching end of array
    tmp = (arr[i] ~ " " arr[i+1] ~ " " arr[i+2]); //make a phrase    
    else break;    
    count 0;
    exists false;
    int jstring s2arr//compare each phrase to whole text
    tmp2 = (arr[j] ~ " " arr[j+1] ~ " " arr[j+2]);
                else break;
    icmp(tmptmp2) == 0count += 1;
    //a phrase is only considered once count has reached 3
    count >= 3
    //if phrase already exists in array, don't add it
    icmp(s2tmp) == 0exists true;
    existsphrases ~= tmp;
    writeln(s); //write them out

    //entry point: reads the file named by the first argument and writes the
    //word count, unique word count, sentence count, average sentence length,
    //common words and common phrases
    //NOTE(review): keywords, braces, operators and some whole statements are
    //missing from this listing (the calls that follow the "Common words:" and
    //"Common phrases:" headings are not visible); restore before compiling.
    int main(string[] args)
    File file;
    //open file specified in argument
    if(args.length 1)
    exists(args[1])) file File(args[1], "r");
    writeln("Incorrect file name or file doesn't exist!");
    writeln("First argument must not be empty!");
    char[] buf;
    string[] matches;
    char[] content;
    int count_w 0;
    int count_s 0;
        while (
    file.readln(buf) != 0)
    buf tr(buf"\n"" "); //remove new lines from text and add it to array
    content ~= buf;    
    mmatch(contentregex(`[\w']+`))) //matches any word character and ' repeated one or more times
    count_w += 1;
    //every matched word is stored in lower case
    matches ~= cast(string)toLower(m.hit);
    mmatch(contentregex(`[.?!]`))) //matches all . and ? and ! - sentences
    count_s += 1;
    //Writing it all out
    string[] unique Unique(matches);
    writefln("Total word count: %s"count_w);
    writefln("Unique words: %s"unique.length);
    writefln("Sentences: %s"count_s);
    //the average is only computed from count_s when count_s is not 0
    count_s != 0writefln("Average sentence length: %s"count_w count_s);
    writefln("Average sentence length: %s"count_w);
    writeln("Common words:");
    writeln("Common phrases:");

    Sample output from a modified text file:
    Compiling, please wait...
    Total word count: 563
    Unique words: 226
    Sentences: 38
    Average sentence length: 14
    Common words:
    35 - lala
    25 - cats
    24 - kaj
    20 - the
    15 - a
    13 - are
    12 - they
    12 - of
    10 - to
    Common phrases:
    kaj kaj kaj
    cats can be
    lala lala lala
    Your left hand is touching your face.

    Re: Beginners programming challenge #24

    Unless anyone asks for an extension, this challenge will be judged on January 7th.
    Re: Beginners programming challenge #24

    I'm sorry, but I was hit with an unexpected schedule change today and I was unable to judge the challenge. I will attempt to do so tomorrow. If I still don't manage it, I may ask someone else to do it.
    Re: Beginners programming challenge #24

    HIGHSPIDER wins for his clean code, implementation of all cookie points, excellent commenting, and that I was able to compile and run his program on Windows. That saved me a good amount of time.

    I may or may not hand out assorted extra awards later. We'll see. In the meantime, congratulations to highspider.

    Re: Beginners programming challenge #24

    Using flex and a bit of C

    Compile and run with:
    flex lexer.lex
    gcc -c set.c
    gcc lex.yy.c set.o -lfl
    /* UFPC 24 */
    /* NOTE(review): the flex section markers, the closing braces and most of  */
    /* the rule actions are missing from this listing; nothing visible here    */
    /* changes words or sent or starts the scanner. Restore from the original. */
    %option nodefault
    #include "set.h"
      /* counters printed by main at the end */
      int words = 0;
      int uwords = 0;
      int sent = 0;
    /* a run of letters: put() adds its result to the unique word counter */
    [a-zA-Z]+ {
      uwords += put(yytext);
    /* sentence terminators */
    [\.?!] {
    /* any other character, newline included */
    .|\n {
    /* optional argv[1] is opened as the scanner input; 1 is returned if that fails */
    main(int argc, char** argv) {
      if(argc > 1) {
        if(!(yyin = fopen(argv[1], "r"))) {
          return 1;
      printf("words: %d\n", words);
      printf("uwords: %d\n", uwords);
      printf("sents: %d\n", sent);
      return 0;
    #ifndef SET_H
    #define SET_H
    /* A very small set of strings, kept in a fixed table that is searched
     * from the front. */

    /* maximum number of strings the set can hold */
    #define TABLESIZE 2048

    /* storage of the set; used cells come first, an unused cell is NULL.
     * NOTE(review): this defines the table in every file that includes the
     * header -- confirm the toolchain accepts that, or move the definition
     * into set.c and declare it extern here. */
    char* table[TABLESIZE];

    /* mark every cell of the table as unused */
    void init_set();
    /* add a string; 1 if it was added, 0 if it was not */
    int put(char*);
    /* 1 if the string is in the set, 0 otherwise */
    int contains(char*);
    /* number of strings in the set */
    int count();

    #endif /* SET_H */
    #include <string.h>
    #include "set.h"
    /* Mark every cell of the table as unused. */
    void init_set() {
      size_t i;
      for(i=0; i<TABLESIZE; i++)
        table[i] = NULL;
    }
    /* Return 1 if a string equal to s is stored in the table, 0 otherwise.
     * The search stops at the first unused cell. */
    int contains(char* s) {
      size_t i;
      for(i=0; i<TABLESIZE && table[i]!=NULL; i++) {
        if(strcmp(table[i], s) == 0)
          return 1;
      }
      return 0;
    }
    /* Store a copy of s in the first unused cell of the table.
     * Returns 1 when the string was added, and 0 when it is already stored,
     * the table is full or the copy could not be allocated.
     * The copy is only made once a free cell is found, so nothing is leaked
     * for strings that are not stored. */
    int put(char* s) {
      size_t i;
      if(contains(s))
        return 0;
      for(i=0; i<TABLESIZE; i++) {
        if(table[i]==NULL) {
          table[i] = strdup(s);
          return table[i] != NULL;
        }
      }
      return 0;
    }
    /* Return the number of used cells in the table. */
    int count() {
      int n = 0;
      size_t i;
      for(i=0; i<TABLESIZE; i++) {
        if(table[i] != NULL)
          n++;
      }
      return n;
    }
    Please excuse the ghetto hashset implementation

    Re: Beginners programming challenge #24

    I have attempted to make contact with Highspider, but s/he hasn't been online for a week or so. If I haven't received a reply by the 23rd then the beginners team will take over the posting and judging of the next challenge.

    Re: Beginners programming challenge #24

    Well, it's the 23rd and I'm responding. I spent a long time coding this because it was college break for Christmas.

    I was away from town and no internet is why I took so long to respond.

    woohoo cool beans.

    The reason it works on both Windows and Linux is that I wrote it with Eclipse on Ubuntu and then debugged the memory leaks with Visual Studio C++.

    I finally found out that one has to manually install a newer version of Eclipse ("Indigo") and then use Valgrind for Ubuntu in order to run the Valgrind memory-leak check.
    Re: Beginners programming challenge #24

    I really look forward to programming challenge #25.

    Hope that becomes a reality as soon as possible.
    Re: Beginners programming challenge #24

    highspider has asked me to post the next challenge on his behalf, so keep your RSS readers running, it will be up soon!

