Hello Everyone! This is a Vector Space Model (Search Ranking Algorithm) in C# Implementation Demo. This algorithm is related to Information Retrieval domain. For preprocessing apply porter stemming algorithm then complete VSM using cosine similarity.

Source Code of Porter Stemming Algorithm:

using System;
public class Program
{
    public static void Main()
    {
        var stemmer = new PorterStemmer();
        var stem = stemmer.StemWord("zohair is running");
        Console.WriteLine(stem);
    }
}
///
/// The Stemmer class transforms a word into its root form.
/// Implementing the Porter Stemming Algorithm
///

///
/// Modified from: http://tartarus.org/martin/PorterStemmer/csharp2.txt
///

///
/// var stemmer = new PorterStemmer();
/// var stem = stemmer.StemWord(word);
///

public class PorterStemmer
{
    // The passed in word turned into a char array.
    // Quicker to use to rebuilding strings each time a change is made.
    private char[] wordArray;
    // Current index to the end of the word in the character array. This will
    // change as the end of the string gets modified.
    private int endIndex;
    // Index of the (potential) end of the stem word in the char array.
    private int stemIndex;
    ///
    /// Stem the passed in word.
    ///

    /// Word to evaluate
    ///
    public string StemWord(string word)
    {
        // Do nothing for empty strings or short words.
        if (string.IsNullOrWhiteSpace(word) || word.Length <= 2) return word;
        wordArray = word.ToCharArray();
        stemIndex = 0;
        endIndex = word.Length - 1;
        Step1();
        Step2();
        Step3();
        Step4();
        Step5();
        Step6();
        var length = endIndex + 1;
        return new String(wordArray, 0, length);
    }
    // Step1() gets rid of plurals and -ed or -ing.
    /* Examples:
           caresses  ->  caress
           ponies    ->  poni
           ties      ->  ti
           caress    ->  caress
           cats      ->  cat
           feed      ->  feed
           agreed    ->  agree
           disabled  ->  disable
           matting   ->  mat
           mating    ->  mate
           meeting   ->  meet
           milling   ->  mill
           messing   ->  mess
           meetings  ->  meet           */
    private void Step1()
    {
        // If the word ends with s take that off
        if (wordArray[endIndex] == 's')
        {
            if (EndsWith("sses"))
            {
                endIndex -= 2;
            }
            else if (EndsWith("ies"))
            {
                SetEnd("i");
            }
            else if (wordArray[endIndex - 1] != 's')
            {
                endIndex--;
            }
        }
        if (EndsWith("eed"))
        {
            if (MeasureConsontantSequence() > 0)
                endIndex--;
        }
        else if ((EndsWith("ed") || EndsWith("ing")) && VowelInStem())
        {
            endIndex = stemIndex;
            if (EndsWith("at"))
                SetEnd("ate");
            else if (EndsWith("bl"))
                SetEnd("ble");
            else if (EndsWith("iz"))
                SetEnd("ize");
            else if (IsDoubleConsontant(endIndex))
            {
                endIndex--;
                int ch = wordArray[endIndex];
                if (ch == 'l' || ch == 's' || ch == 'z')
                    endIndex++;
            }
            else if (MeasureConsontantSequence() == 1 && IsCVC(endIndex)) SetEnd("e");
        }
    }
    // Step2() turns terminal y to i when there is another vowel in the stem.
    private void Step2()
    {
        if (EndsWith("y") && VowelInStem())
            wordArray[endIndex] = 'i';
    }
    // Step3() maps double suffices to single ones. so -ization ( = -ize plus
    // -ation) maps to -ize etc. note that the string before the suffix must give m() > 0.
    private void Step3()
    {
        if (endIndex == 0) return;
        /* For Bug 1 */
        switch (wordArray[endIndex - 1])
        {
            case 'a':
                if (EndsWith("ational")) { ReplaceEnd("ate"); break; }
                if (EndsWith("tional")) { ReplaceEnd("tion"); }
                break;
            case 'c':
                if (EndsWith("enci")) { ReplaceEnd("ence"); break; }
                if (EndsWith("anci")) { ReplaceEnd("ance"); }
                break;
            case 'e':
                if (EndsWith("izer")) { ReplaceEnd("ize"); }
                break;
            case 'l':
                if (EndsWith("bli")) { ReplaceEnd("ble"); break; }
                if (EndsWith("alli")) { ReplaceEnd("al"); break; }
                if (EndsWith("entli")) { ReplaceEnd("ent"); break; }
                if (EndsWith("eli")) { ReplaceEnd("e"); break; }
                if (EndsWith("ousli")) { ReplaceEnd("ous"); }
                break;
            case 'o':
                if (EndsWith("ization")) { ReplaceEnd("ize"); break; }
                if (EndsWith("ation")) { ReplaceEnd("ate"); break; }
                if (EndsWith("ator")) { ReplaceEnd("ate"); }
                break;
            case 's':
                if (EndsWith("alism")) { ReplaceEnd("al"); break; }
                if (EndsWith("iveness")) { ReplaceEnd("ive"); break; }
                if (EndsWith("fulness")) { ReplaceEnd("ful"); break; }
                if (EndsWith("ousness")) { ReplaceEnd("ous"); }
                break;
            case 't':
                if (EndsWith("aliti")) { ReplaceEnd("al"); break; }
                if (EndsWith("iviti")) { ReplaceEnd("ive"); break; }
                if (EndsWith("biliti")) { ReplaceEnd("ble"); }
                break;
            case 'g':
                if (EndsWith("logi"))
                {
                    ReplaceEnd("log");
                }
                break;
        }
    }
    /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */
    private void Step4()
    {
        switch (wordArray[endIndex])
        {
            case 'e':
                if (EndsWith("icate")) { ReplaceEnd("ic"); break; }
                if (EndsWith("ative")) { ReplaceEnd(""); break; }
                if (EndsWith("alize")) { ReplaceEnd("al"); }
                break;
            case 'i':
                if (EndsWith("iciti")) { ReplaceEnd("ic"); }
                break;
            case 'l':
                if (EndsWith("ical")) { ReplaceEnd("ic"); break; }
                if (EndsWith("ful")) { ReplaceEnd(""); }
                break;
            case 's':
                if (EndsWith("ness")) { ReplaceEnd(""); }
                break;
        }
    }
    /* step5() takes off -ant, -ence etc., in context vcvc. */
    private void Step5()
    {
        if (endIndex == 0) return;
        switch (wordArray[endIndex - 1])
        {
            case 'a':
                if (EndsWith("al")) break; return;
            case 'c':
                if (EndsWith("ance")) break;
                if (EndsWith("ence")) break; return;
            case 'e':
                if (EndsWith("er")) break; return;
            case 'i':
                if (EndsWith("ic")) break; return;
            case 'l':
                if (EndsWith("able")) break;
                if (EndsWith("ible")) break; return;
            case 'n':
                if (EndsWith("ant")) break;
                if (EndsWith("ement")) break;
                if (EndsWith("ment")) break;
                /* element etc. not stripped before the m */
                if (EndsWith("ent")) break; return;
            case 'o':
                if (EndsWith("ion") && stemIndex >= 0 && (wordArray[stemIndex] == 's' || wordArray[stemIndex] == 't')) break;
                /* j >= 0 fixes Bug 2 */
                if (EndsWith("ou")) break; return;
            /* takes care of -ous */
            case 's':
                if (EndsWith("ism")) break; return;
            case 't':
                if (EndsWith("ate")) break;
                if (EndsWith("iti")) break; return;
            case 'u':
                if (EndsWith("ous")) break; return;
            case 'v':
                if (EndsWith("ive")) break; return;
            case 'z':
                if (EndsWith("ize")) break; return;
            default:
                return;
        }
        if (MeasureConsontantSequence() > 1)
            endIndex = stemIndex;
    }
    /* step6() removes a final -e if m() > 1. */
    private void Step6()
    {
        stemIndex = endIndex;
        if (wordArray[endIndex] == 'e')
        {
            var a = MeasureConsontantSequence();
            if (a > 1 || a == 1 && !IsCVC(endIndex - 1))
                endIndex--;
        }
        if (wordArray[endIndex] == 'l' && IsDoubleConsontant(endIndex) && MeasureConsontantSequence() > 1)
            endIndex--;
    }
    // Returns true if the character at the specified index is a consonant.
    // With special handling for 'y'.
    private bool IsConsonant(int index)
    {
        var c = wordArray[index];
        if (c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u') return false;
        return c != 'y' || (index == 0 || !IsConsonant(index - 1));
    }
    /* m() measures the number of consonant sequences between 0 and j. if c is
       a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
       presence,
                gives 0
          vc     gives 1
          vcvc   gives 2
          vcvcvc gives 3
          ....             */
    private int MeasureConsontantSequence()
    {
        var n = 0;
        var index = 0;
        while (true)
        {
            if (index > stemIndex) return n;
            if (!IsConsonant(index)) break; index++;
        }
        index++;
        while (true)
        {
            while (true)
            {
                if (index > stemIndex) return n;
                if (IsConsonant(index)) break;
                index++;
            }
            index++;
            n++;
            while (true)
            {
                if (index > stemIndex) return n;
                if (!IsConsonant(index)) break;
                index++;
            }
            index++;
        }
    }
    // Return true if there is a vowel in the current stem (0 ... stemIndex)
    private bool VowelInStem()
    {
        int i;
        for (i = 0; i <= stemIndex; i++)
        {
            if (!IsConsonant(i)) return true;
        }
        return false;
    }
    // Returns true if the char at the specified index and the one preceeding it are the same consonants.
    private bool IsDoubleConsontant(int index)
    {
        if (index < 1) return false;
        return wordArray[index] == wordArray[index - 1] && IsConsonant(index);
    }
    /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
       and also if the second c is not w,x or y. this is used when trying to
       restore an e at the end of a short word. e.g.
          cav(e), lov(e), hop(e), crim(e), but
          snow, box, tray.       */
    private bool IsCVC(int index)
    {
        if (index < 2 || !IsConsonant(index) || IsConsonant(index - 1) || !IsConsonant(index - 2)) return false;
        var c = wordArray[index];
        return c != 'w' && c != 'x' && c != 'y';
    }
    // Does the current word array end with the specified string.
    private bool EndsWith(string s)
    {
        var length = s.Length;
        var index = endIndex - length + 1;
        if (index < 0) return false;
        for (var i = 0; i < length; i++)
        {
            if (wordArray[index + i] != s[i]) return false;
        }
        stemIndex = endIndex - length;
        return true;
    }
    // Set the end of the word to s.
    // Starting at the current stem pointer and readjusting the end pointer.
    private void SetEnd(string s)
    {
        var length = s.Length;
        var index = stemIndex + 1;
        for (var i = 0; i < length; i++)
        {
            wordArray[index + i] = s[i];
        }
        // Set the end pointer to the new end of the word.
        endIndex = stemIndex + length;
    }
    // Conditionally replace the end of the word
    private void ReplaceEnd(string s)
    {
        if (MeasureConsontantSequence() > 0) SetEnd(s);
    }
}

Post a Comment

Please Comment !! Any Problem or Any Suggestion

Previous Post Next Post