2016-09-08 32 views
1

我正在嘗試編寫讀取全部 cranfield 文檔的 Java 代碼(Information Retrieval 中的常見練習),以便進行分詞、統計總詞數、找出 50 個最常用詞並刪除預先定義的停用詞。它基本可以正常工作,除了 StopWordsRemoval 方法(代碼中的最後一個)——它沒有按預期改變輸出,調用此方法前後的輸出是相同的!Java 中停用詞去除方法不起作用。

你能幫我弄清楚是什麼問題嗎? 它是在Java中,我的第一個代碼:(

import java.io.*; 
import java.util.*; 

public class Information_Retrieval_Hw1 { 

    //Global variables 
    public static BufferedReader buffer; 
    public static Hashtable<String, Integer> wordList = new Hashtable<String, Integer>(); 
    public static ArrayList<Hashtable <String,Integer>> fileMap = new ArrayList<Hashtable<String,Integer>>(); 
    public static Set<String> tagNames = new HashSet<String>(); 
    //public static ArrayList<Map.Entry<String, Integer>> list; 

    public static int documentsCount = 0; 
    public static int totalTokens = 0; 
    public static int uniqueWords = 0; 
    public static int tagCount = 0; 
    public static int singleOccureneWords = 0; 

    public static ArrayList<Map.Entry<String, Integer>> sortedList; 


    public Information_Retrieval_Hw1() { 
     // TODO Auto-generated constructor stub 
    } 

    public static void main(String[] args) throws IOException { 

     String cranfield = "/Users/Manal/Desktop/semster1/IR/assigenment 1/cranfieldDocs"; 
     File cranfieldFiles = new File(cranfield); 
     ReadFile(cranfieldFiles); 

     System.out.println("Total number of documents: " + fileMap.size()); 

     //Calculate total number of tokens 
     totalTokens = CalculateNumberOfTokens(wordList); 
     System.out.println("Total number Of words = " + totalTokens); 

     //Calculate number of unique words 
     uniqueWords = CalculateUniqueWords(wordList); 
     System.out.println("Total number Of distinct words = " + uniqueWords); 

     //Calculate number of unique words 
     singleOccureneWords = CalculateSingleOccurenceWords(wordList); 
     System.out.println("Total number Of words that occur only once = " + singleOccureneWords); 

     //Find the 30 most frequent words 
     FindFiftyMostFrequentWords(wordList); 

     StopWordsRemoval (cranfieldFiles,wordList); 
     //reprint all information after removing stopword; 

     System.out.println("\n***********************************\nAfter removing stop words \n***********************************\n"); 

     //Calculate total number of tokens 
     totalTokens = CalculateNumberOfTokens(wordList); 
     System.out.println("Total number Of words = " + totalTokens); 

     //Calculate number of unique words 
     uniqueWords = CalculateUniqueWords(wordList); 
     System.out.println("Total number Of distinct words = " + uniqueWords); 

     //Calculate number of unique words 
     singleOccureneWords = CalculateSingleOccurenceWords(wordList); 
     System.out.println("Total number Of words that occur only once = " + singleOccureneWords); 

     //Find the 30 most frequent words 
     FindFiftyMostFrequentWords(wordList); 

    } 

    public static void ReadFile(File cranfieldFiles) throws IOException{ 
     for (File file: cranfieldFiles.listFiles()) 
     { 
      //read files recursively if path contains folder 
      if(file.isDirectory()) 
      { 
       ReadFile(file); 
      } 

      else 
      { 
       documentsCount++; 
       try 
       { 
        buffer = new BufferedReader(new FileReader(file)); 
       } 
       catch (FileNotFoundException e) 
       { 
        System.out.println("File not Found"); 

       } 
       //find the tags and their count 
       tagCount = tagCount + TagHandler(file, tagNames); 
       //find words in the cranfield 
       TokenHandler(file, tagNames); 

      } 
     } 


    } 

    public static int TagHandler(File file, Set<String> tagNames) throws IOException 
    { 
     String line; 
     int tag_count = 0; 


     buffer = new BufferedReader(new FileReader(file)); 
     while((line = buffer.readLine()) != null) 
     { 
     /* 
     * If the line contains a '<', it is considered a tag and tag_count is incremented. 
     */ 
      if(line.contains("<")) 
      { 
       tag_count++; 

       String b = line.replaceAll("[<*>/]", ""); 
       tagNames.add(b); 
      } 

     } 
     tag_count/=2; //Since each tag represent the beginning and the end, we divide it by two to get the actual count. 
     return tag_count; 
    } 

    public static void TokenHandler(File file, Set<String> tagNames) throws IOException 
    { 
     String line; 
     String words[]; 

     buffer = new BufferedReader(new FileReader(file)); 
     Hashtable<String, Integer> tempMap = new Hashtable<String, Integer>(); 

     while((line = buffer.readLine()) != null) 
     { 

      String s1 = line.replaceAll("[^a-zA-Z.]+"," "); //Replace everything that is not an alphabet with a blank space. 
      String s2 = s1.replaceAll("[.]", "");//Replace words with . (eg U.S) as 1 word 
      words = s2.split(" "); 

      for(String word : words) 
      { 
       //Handle the tags properly 
       if(!tagNames.contains(word) && !word.equals("")) 
       { 
        word = word.toLowerCase(); // Converts all words to lower case. 

        //add word if it isn't added already 
        if(!wordList.containsKey(word)) 
        { 
         //first occurance of this word 
         wordList.put(word, 1); 

         //Following is to compute the unique words in each document 
         if(!tempMap.containsKey(word)) 
         { 
          tempMap.put(word,1); 

         } 
         else 
         { 
          tempMap.put(word, tempMap.get(word)+ 1); 

         } 
        } 
        else 
        { 
         //Increament the count of that word 
         wordList.put(word, wordList.get(word) + 1); 
         if(!tempMap.containsKey(word)) 
         { 
          tempMap.put(word,1); 

         } 
         else 
         { 
          tempMap.put(word, tempMap.get(word)+ 1); 
         } 
        } 
       } 
      } 
     } 

     //Add count to file map and after reading every file 
     fileMap.add(tempMap); 
    } 

    //Function to find the total number of tokens in the cranfield database 

    public static int CalculateNumberOfTokens(Hashtable<String, Integer> myWordList) 
    { 
     int noOfTokens = 0; 

     for (Integer value: myWordList.values()) 
     { 
      noOfTokens = noOfTokens + value; 
     } 
     return noOfTokens; 
    } 

    public static int CalculateUniqueWords(Hashtable<String, Integer> myWordList) 
    { 

     return myWordList.size(); 
    } 

    public static int CalculateSingleOccurenceWords(Hashtable<String, Integer> myWordList) 
    { 
     int count = 0; 

     for (Integer value: myWordList.values()) 
     { 
      if(value == 1) 
      { 
       count++; 
      } 
     } 
     return count; 
    } 

    //Sorting the hashTable 
    public static ArrayList<Map.Entry<String, Integer>> SortHashTable(Hashtable<String, Integer> myWordList) 
    { 
     ArrayList<Map.Entry<String, Integer>> list = new ArrayList<Map.Entry<String, Integer>>(myWordList.entrySet()); 
     Collections.sort(list, new Comparator<Map.Entry<String, Integer>>(){ 
      public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) { 
       return o2.getValue().compareTo(o1.getValue()); 
      }}); 
     return list; 
    } 

    public static void FindFiftyMostFrequentWords(Hashtable<String, Integer> myWordList) 
    { 
     //Sort the hashtable based on value 

     sortedList = SortHashTable(myWordList); 
     System.out.println("The 50 most frequent words are: "); 
     for(int i=0;i<50;i++) 
     { 
      System.out.println("\t" + (i+1) + "." + " " + sortedList.get(i)); 
     } 
    } 

    public static Hashtable<String, Integer> StopWordsRemoval (File file, Hashtable<String, Integer> wordList) throws IOException { 
     int k=0,j; 
     String sCurrentLine; 
     String[] stopwords = new String[2000]; 
     try 
     { 
      FileReader fr=new FileReader("/Users/Manal/Desktop/semster1/IR/assigenment 1/xid-10624858_1.txt"); 
      BufferedReader br= new BufferedReader(fr); 
      while ((sCurrentLine = br.readLine()) != null){ 
       stopwords[k]=sCurrentLine; 
       k++; 
      } 
      Set<String> keys = wordList.keySet(); 
      for(String key: keys) 
      { 
       for(j = 0; j < k; j++) 
       { 
        if(wordList.keySet().equals(stopwords[j])) 
         wordList.remove(key); 
       } 
      } 
     } 
     catch(Exception ex) 
     {System.out.println(ex);} 

     return wordList; 
    } 
} 
+0

我想你必須將if語句從「if(wordList.keySet()。equals(stopwords [j]))」更改爲「if(key.equals(stopwords [j]))」 – Eritrean

+0

我試過了,但我想知道它只是刪除一個單詞? –

回答

0

我認爲這是在代碼中的問題

if(wordList.keySet().equals(stopwords[j])) 

你這裏做的是檢查整個鍵集合(keySet() 返回一個 Set)是否等於某個單詞,而不是檢查鍵集合中是否包含這個單詞。試試這個:

if(wordList.keySet().contains(stopwords[j])) 

讓我知道是否能解決你的問題

+0

我嘗試過,但我想知道,它只是刪除一個詞? –