0

I have a container (array or vector) and millions of words. I need to sort them in following order s.

The primary sort order should be the number of characters in the word. The secondary sort order should be lexicographical. I can not use any library such as sort. I want to create the algorithms from scratch. I appreciate if anyone can hit me up with any reference.

So sorting the words:

This is a list of unsorted words

should give:

a is of This list words unsorted

Edit:

I am not allowed to use any STL such as sort

//Following is my final program
//It wi be run with following:  args: <inputfile> <outputfile> <timesfile> <ntests>  
//timesfile is for storing times and ntests is for number of test
/*
Bernard Grey
10 Wednesday 10 Sep 2014
*/
#include <iostream>
#include <ctime>
#include <algorithm>
#include <fstream>
#include <cctype>
#include <cstdlib>
#include <cstring>
#include <vector>

using namespace std;

//This node contain two type of information both in the vector
//First is vector for hash function. it contains number of repetition of the word
//Second node contain a word for values in my vector and the other field is for future implementation ;)
struct node
{
    string val;
    int count;
};

//Definition of inner and outer vectors as cintainer of words and hash table
typedef std::vector<node> StringVector;
typedef std::vector<StringVector> StringVector2D;




//Cited at http://stackoverflow.com/questions/8317508/hash-function-for-a-string :In the comment
int HashTable (string word)
{
   int seed = 378551; 
   unsigned long hash = 0;
   for(int i = 0; i < word.length(); i++)
   {
      hash = (hash * seed) + word[i];
   }
   return hash % 1000000;//Later assign it to number of words
}

//Cite at: http://stackoverflow.com/questions/25726530/how-to-find-an-struct-element-in-a-two-dimention-vector
struct find_word
{
    string val;
    find_word(string val) : val(val) {}
    bool operator () ( const node& m ) const
    {
        return m.val == val;
    }
};


//I could use swap function in vector instead of implementing this function
void swap(StringVector& vec, int i, int j)
{
    node tmp = vec[i];
    vec[i] = vec[j];
    vec[j] = tmp;

}

//To compare string alphabetically order
bool comp(node& i,node& p)
{
    int cmp;
    if(i.val.compare(p.val)<0)
    {
        return true;
    }
    return false;
}

void quickSort(StringVector& aVec, int left, int right);
int partition(StringVector& aVec, int left, int right);
void swap(StringVector& aVec, int left, int right);


void quickSort(StringVector& aVec, int left, int right)
{
    if(right>0){
        int index = partition(aVec,left,right);

        if (left<index-1) {

            quickSort(aVec, left, index-1);

        }
        if (index<right) {
            quickSort(aVec, index,right);
        }
    }    
}
int partition(StringVector& aVec, int left, int right)
{

    string pivotNode;

         pivotNode = aVec[(left+right)/2].val;

    while (left<=right) { 

        while (aVec[left].val.compare(pivotNode)<0) {left++;  }  
        while (aVec[right].val.compare(pivotNode)>0) {right--;  }

        if (left<=right) {

           swap(aVec,left,right);
           left++;
           right--;
        }
    }
    return left;
}

//Welcome to Maaaain
int main(int argc, char* argv[])
{

    /*file reading and preprocessing*/
    if(argc != 5)
    {
        cerr << "usage: " << argv[0]  << " infile outfile timesfile ntests" << endl;
    }

    ifstream fin(argv[1]);
    if(fin.fail())
    {
        cerr << "Error: failed to open file " << argv[1]  << " for input" << endl;
        exit(EXIT_FAILURE);
    }
    int ntests = atoi(argv[4]);
    //Len of string and max num word
    int stringlen, numwords;
    get_max_words(fin, stringlen, numwords);
    //initial string
    string init[numwords];


    //Read the file and add it to first array
    for(int i=0; i<numwords; i++)
    {

        string tmp;
        fin >> tmp;
        int len = tmp.length();
        //There is one single ' in the example output file. so I do not want to delete that one :-)
        bool pp = true;
        //Remove punct from leading and tail
        if(len==1)
        {
            pp=false;
        }
        //Remove punc
        if( ispunct(tmp[0]) && pp)
        {
            tmp.erase(0,1);
        }
        //Remove punc
        if( ispunct(tmp[len-1]) && pp)
        {
            tmp.erase(len-1,1);
        }
        init[i] =tmp;
    }

    /*
    At this point, everything should be in the initial array
    The temporary array should be declared but not filled
    */

    clockid_t cpu;
    timespec start, end;
    long time[ntests];
    //2 Dimension vector this will called outer vector
    StringVector2D twoD;



    if(clock_getcpuclockid(0, &cpu) != 0)
    {
        cerr << "Error: could not get cpu clock" << endl;
        exit(EXIT_FAILURE);
    }
    int rep = 0;


    node tmp;
    tmp.count = 0;
    tmp.val = "";
    //Later I need to assign it to number of words * M ... Good for encryption... It is not a security subject
    vector<node> first(1000000,tmp);
    //This is called inner vector
    vector<string> templateVec;
    //Last search?
    bool last = false;

    //Initialize inner map as needed and put it inside the outer vector with no data
    for(int f=0;f<(stringlen);f++)
    {   

        StringVector myVec;
        twoD.push_back(myVec);
    }


    for(int i=0; i<ntests; i++)
    {   

        if(clock_gettime(cpu, &start) == -1)
        {
            cerr << "Error: could not get start time" << endl;
            exit(EXIT_FAILURE);
        }


        //Check if it is last iteration so do not delete data for printing purposeses
        if(i == ntests-1)
        {
            last = true;
        }

        /*copy from initial array to temporary array*/
        //Initialize inner vector with the values. In this point outer vector is filled with inner vector
        //&&&  inner vector is empty  myvec.empty() = true;
        //vector at index 0 is for words with one char... vector 1 is for words with two chars and so on...
        for(int j=0; j<numwords; j++)
        {
            int len = init[j].length()-1;
            if(len<0)continue;

            //Initilize a node to fill up the vector
            node currNode;
            currNode.val = init[j];
            //currNode.count = 0;           

            int hash  =  HashTable(init[j]);
            //Node already existed
            if(first[hash].count != 0){
                //Add to its value in hash table
                first[hash].count++;
            }
            else
            {
                //Activate word first time!
                first[hash].count =1;
                //I can even not use this because of the hash table but it may help in future improvment!!!
                first[hash].val = init[j];
                //Add the word to appropriate level in outer string! 1char == [0] ---  2char== [1] so on
                twoD[len].push_back(currNode);
            }   
        }
        //Sort Alphabetically order
        for(int f=0;f<(stringlen);f++)
        {
            //Eficcient sorting algorithm with no chance of segmentation dump ;)
            quickSort(twoD[f],0,twoD[f].size()-1);          
        }
        //Time finished
        if(clock_gettime(cpu, &end) == -1)
        {
            cerr << "Error: could not get end time" << endl;
            exit(EXIT_FAILURE);
        }

        //Delete items from vector if it is not last iteration --- This is not part of sorting algorithm so it is after clock
        if(!last)
        {
            for(int f=0;f<stringlen;f++)
            {
                twoD[f].clear();
            }
            twoD.clear();

            for(StringVector::iterator it3 = first.begin();it3!=first.end();it3++)
            {
                it3->val="";
                it3->count=0;
            }

            //Initialize inner map as needed and put it inside the outer vector 
            for(int f=0;f<(stringlen);f++)
            {
                StringVector myVec;
                twoD.push_back(myVec);
            }           
        }

        /*time per trial in nanoseconds*/
        time[i] = (end.tv_sec - start.tv_sec)*1000000000 + end.tv_nsec - start.tv_nsec;
    }


    /*output sorted temporary array*/
    int k=0;
    int y =0;
    int num=0;
    ofstream fout(argv[2]); 
    //Pointer for inner vector
    StringVector::iterator it2;
    for (StringVector2D::iterator outer = twoD.begin();  outer != twoD.end();  ++outer){
        y++;
        k=0;
        for (it2= outer->begin(); it2!=outer->end(); ++it2){
            //Get back data from hash table
            int hash  =  HashTable(it2->val);
            //Number of word in other field of the node
            int repWord = first[hash].count;
            //Print according to that
            for(int g=0; g < repWord ;g++){
                    num++;
                    //10 char in one line
                    if(num%10 == 0)
                    {
                        fout << it2->val;
                        fout<<endl;
                        k++;
                    }
                    else
                    {
                        fout<< it2->val << "  ";
                    }
                }
            }
        }
    //Sort times with STL for god sake....
    sort(time,time+ntests);
    //print times to the file///
    ofstream ftimes(argv[3]);
    for(int i=0; i<ntests; i++)
        ftimes << time[i] << endl;
}



//Helper function .. nice job
void get_max_words(ifstream& fin, int& wordlen, int& numwords)
{
    char c;
    int count=0;
    wordlen = numwords = 0;

    while(fin.good() && fin.get(c) && isspace(c)){;} //skip leading space
    while(fin.good())
    {
        ++numwords;
        while(fin.good() && !isspace(c)) 
        {
            ++count;
            fin.get(c);
        }
        if(count > wordlen)
            wordlen = count;
        count = 0;
        while(fin.good() && fin.get(c) && isspace(c)){;} //skip space
    }   
    if(count > wordlen)
        wordlen = count;
    fin.clear();
    fin.seekg(0, ios::beg);
}
3
  • 2
    1) Re-implement a general-purpose sorting function which accepts a general comparator. 2) Pass an appropriately constructed comparator (trivial). — There is no reason at all to conflate these two problems into one. Commented Sep 2, 2014 at 16:49
  • When you implement a sorting routine from scratch, you will come to the point where you must write the comparator. In your case, make the comparator compare lengths first, and only perform a lexical compare on words of the same length. (This is similar to what Konrad Rudolph said, but with the comparator embedded instead of called.) Commented Sep 2, 2014 at 17:09
  • @A.I.Breveleri I was thinking to put the array in multi dimension array. first dimension is words with one character and second row is words with two character and so on.. And then sort each row independently. Do you think it is good sufficient way? Commented Sep 2, 2014 at 17:17

4 Answers 4

3

You'll primarily need a comparator for your sort routine to sort on:

bool lessThan(const std::string a, const std::string b) {
    if (a.length() != b.length()) 
         return a.length() < b.length();
    return a < b;
}
Sign up to request clarification or add additional context in comments.

7 Comments

OP doesn't want sort to be present since he 's not allowed to use library.
@nevets i said "your sort" meaning the OP's
@nevets this answer doesn't place any restrictions on how the given function should be used. Yes it's usable for std::sort but it should be just as usable in a home-grown sorting function.
@MarkRansom Do you know any example of home-grown sorting function combined with this. I need a little bit more hint.
@Bernard at some point in any sorting routine, two elements have to be compared to see which one goes first, hence if (lessThan(a, b)) { // ...
|
1

There's actually an easy way to implement this in stl. There's a sort method that takes a comparator:

template <class RandomAccessIterator, class Compare>
  void sort (RandomAccessIterator first, RandomAccessIterator last, Compare comp);

So you can do this:

bool comparator(const string& a, const string& b) {
    if (a.length() < b.length())
        return true;
    if (a.length() == b.length())
        return a < b;
    return false;
}

sort(words.begin(), words.end(), comparator);

2 Comments

As I mentioned in question I do not want to use any library such as "sort"
OP doesn't want sort to be present.
1

It's about sorting based on multiple keys. I suggest you study some efficient sorting algorithm, say Quick Sort, then change the comparator to adapt the multiple keys.

For any sorting algorithm that is based on comparing, the easiest way to adapt multiple key sorting is to change the comparing criteria, from a single value to multiple values.

If you are not even allowed to use STL, i.e. you are not allowed to use sort in , here is a post you can start with: Sorting an array using multiple sort criteria (QuickSort)

If you are allowed, just write a comparing function which supports the multiple key comparison and plug it in the sort function. You can check this C++ reference for more details.

An illustration (it's just an illustration to point out how you can plug in the compare function):

bool comparator(const string& a, const string& b) {
    if (a.length() < b.length())
        return true;
    if (a.length() > b.length())
        return false;

    return a < b;
}

void Qsort(string a[],int low,int high)
{
    if(low >= high)
    {
        return;
    }

    int left = low;
    int right = high;
    string key = a[(low + high) >> 1];

    while(left < right)
    {
        while(left < right && comparator(a[left], key)) left++;     
        while(left < right && !comparator(a[right], key)) right--;

        if (left < right)
        {
             swap(a[left], a[right]);
             left++; right--;
        }
    }

    if (left == right) left ++;

    if (low < right) Qsort(a, low, left - 1);
    if (high > left) Qsort(a, right + 1, high);
}

3 Comments

Thanks. Link is helping a lot. Unfortunately, I am not allowed to use STL.
@Bernard check the SO thread I posted in my answer, it's quite useful in your case :)
Your answer looks complete. I will implement it and finger cross if it works I accept your answer as solved very soon. Thanks.
1

The answer wants a design, so I'll focus on the design of your sorting library, than an implementation

Your sort algorithm can use your custom comparator objects with a member operator() implemented for comparison between two elements.

Your comparator can be a Linked List of comparators and can call the next comparator if the current one gives a tie. You'll have to ensure that there is always a true and false return though. Or implement something that can create a stable_sort if nothing else.

So the first comparator is number of characters and the second comparator is lexicographical.. This idea is then general enough so that if your requirement changes tomorrow. This can then be reused.

This is on the lines of Chain of Responsibility Pattern. You can templat-ize the comparator after you've got the gist.

Ex:

class Chain_Comparator
{
    Chain_Comparator* next;
public:
     bool operator()( void* a, void* b )
     {
          if( a_is_less_b(a, b) )
              return true;
          else if( b_is_less_a(a,b) )
              return false;
          else if( next )
              return next( a, b )    
     }    
    virtual bool a_is_less( void* a, void* b) = 0;
    virtual bool b_is_less( void* a, void* b) = 0;
 };

class Num_Comparator : public Chain_Comparator
{
    // Implements a_is_less etc.
};
class Lex_Comparator : public Chain_Comparator
{
  // Implement lex comparisons.
};

void your_custom_sorting_method( vector<int > a, Chain_Comparator& c)
{
// Implementation goes here.
//  call the operator() for c with simply : c( a[i], a[j] )

}

Comments

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.