From 34ae9c4f3c712d2bf3e0544d024114a2cd8d14af Mon Sep 17 00:00:00 2001 From: amorphousxd Date: Sun, 5 Jan 2014 14:14:50 +0400 Subject: [PATCH] DBHelper reworked --- src/corpus_parser/DatabaseHelper.java | 9 ++--- src/corpus_parser/Main.java | 9 ++++- src/corpus_parser/StatsManagement.java | 2 +- src/corpus_parser/StringHelper.java | 5 ++- src/corpus_parser/Word.java | 1 + src/corpus_parser/finnish/ParserFI.java | 4 +- src/corpus_parser/finnish/WordFI.java | 3 +- src/corpus_parser/italian/ParserITA.java | 5 --- src/corpus_parser/italian/WordITA.java | 7 ++-- src/corpus_parser/russian/ParserRU.java | 23 +++++------ src/corpus_parser/russian/WordRU.java | 49 ++++++++++++++++-------- 11 files changed, 72 insertions(+), 45 deletions(-) diff --git a/src/corpus_parser/DatabaseHelper.java b/src/corpus_parser/DatabaseHelper.java index 435b7da..e4f5869 100644 --- a/src/corpus_parser/DatabaseHelper.java +++ b/src/corpus_parser/DatabaseHelper.java @@ -107,7 +107,7 @@ public void insertWord(int _internalId, int _dom, String _lemma, String _link, S System.out.println("SQL Statement: " + wordStatement.asSql()); wordStatement.executeUpdate(); } finally { - wordStatement.close(); //without that there should be memory leak (byte[] not GC'd) + wordStatement.close(); } } catch (SQLException e) { e.printStackTrace(); @@ -154,12 +154,11 @@ public int insertSentence(int _internalId, String _sentence, int _text_id){ public static void truncateTable(String tableName){ try { - Statement trunkate = (Statement) c.createStatement(); - PreparedStatement ptruncateWords = (PreparedStatement) c.prepareStatement("TRUNCATE TABLE "+tableName); + PreparedStatement truncateStatement = (PreparedStatement) c.prepareStatement("TRUNCATE TABLE "+tableName); try{ - ptruncateWords.executeUpdate(); + truncateStatement.executeUpdate(); } finally { - ptruncateWords.close(); + truncateStatement.close(); } } catch (SQLException e) { e.printStackTrace(); diff --git a/src/corpus_parser/Main.java b/src/corpus_parser/Main.java index 4cb005d..a111c85 100644 --- a/src/corpus_parser/Main.java +++ b/src/corpus_parser/Main.java @@ -27,9 +27,9 @@ public static void main(String[] args) { ///////////////////////////////////////////////////////////// - DatabaseHelper.truncateTable("words"); //очищаем таблицу words + /*DatabaseHelper.truncateTable("words"); //очищаем таблицу words DatabaseHelper.truncateTable("texts"); - DatabaseHelper.truncateTable("sentences"); + DatabaseHelper.truncateTable("sentences"); */ final File folderRU = new File("C:\\corpus\\corpus_ru"); String resultsPath = "C:\\corpus_stats\\results_rus.csv"; @@ -45,11 +45,16 @@ public static void main(String[] args) { StatsManagement.getStats(folderITA, StatsManagement.CorpusLanguage.ITALIAN, dbhelper); StatsManagement.writeStats(resultsPath, true);*/ + //////////////////////////////////////////////////////////// + /*final File folderSW = new File("C:\\corpus\\corpus_swe"); String resultsPath = "C:\\corpus_stats\\results_swe.csv"; StatsManagement.getStats(folderSW, StatsManagement.CorpusLanguage.SWEDISH, dbhelper); StatsManagement.writeStats(resultsPath, true);*/ + + /////////////////////////////////////////////////////////// + /*final File folderITA = new File("C:\\corpus\\corpus_deu"); String resultsPath = "C:\\corpus_stats\\results_deu.csv"; diff --git a/src/corpus_parser/StatsManagement.java b/src/corpus_parser/StatsManagement.java index 7ad1f72..160ae8a 100644 --- a/src/corpus_parser/StatsManagement.java +++ b/src/corpus_parser/StatsManagement.java @@ -55,7 +55,7 @@ public static void getStats(final File folder, CorpusLanguage language, Database p = new ParserGER(fileEntry.getAbsolutePath(), _dbhelper); break; } - p.getStats(); + p.getStats(); } } } diff --git a/src/corpus_parser/StringHelper.java b/src/corpus_parser/StringHelper.java index 15d8131..42b285d 100644 --- a/src/corpus_parser/StringHelper.java +++ b/src/corpus_parser/StringHelper.java @@ -1,5 +1,7 @@ package corpus_parser; +import java.util.ArrayList; +import java.util.List; import java.util.Vector; /** @@ -9,10 +11,11 @@ * Time: 10:23 * To change this template use File | Settings | File Templates. */ +//Splits string with Regular Expression, just to avoid Java String.split() memory leak, works the same way public abstract class StringHelper { public static String[] splitString(String str, String regex){ { - Vector result = new Vector(); + List result = new ArrayList(); int start = 0; int pos = str.indexOf(regex); while (pos>=start) { diff --git a/src/corpus_parser/Word.java b/src/corpus_parser/Word.java index 2504888..cd37637 100644 --- a/src/corpus_parser/Word.java +++ b/src/corpus_parser/Word.java @@ -7,4 +7,5 @@ * Time: 22:47 */ public interface Word { + //public void getProperties(); } diff --git a/src/corpus_parser/finnish/ParserFI.java b/src/corpus_parser/finnish/ParserFI.java index 7af62f5..e49ce2e 100644 --- a/src/corpus_parser/finnish/ParserFI.java +++ b/src/corpus_parser/finnish/ParserFI.java @@ -71,8 +71,10 @@ public void parse(String fileName){ Node sentenceNode = sentences.item(i); if (sentenceNode.getNodeType() == Node.ELEMENT_NODE) { - Element sentenceElement = (Element) sentenceNode; + //insert into db + + // NodeList words = sentenceElement.getElementsByTagName(XML_NODE_WORD); NodeList dependencies = sentenceElement.getElementsByTagName(WORD_ATTR_DEP_ID); //пригодится чуть ниже !!! HashMap wordsMap = new HashMap(); diff --git a/src/corpus_parser/finnish/WordFI.java b/src/corpus_parser/finnish/WordFI.java index 933f4d6..065b80c 100644 --- a/src/corpus_parser/finnish/WordFI.java +++ b/src/corpus_parser/finnish/WordFI.java @@ -1,5 +1,6 @@ package corpus_parser.finnish; +import corpus_parser.StringHelper; import corpus_parser.Word; /** @@ -26,6 +27,6 @@ public WordFI(int _dom, String _feat, int _id, String _lemma, String _link) this.id = _id; this.lemma = _lemma; this.link = _link; - this.featValues = this.feat.split(" "); + this.featValues = StringHelper.splitString(_feat, " "); } } diff --git a/src/corpus_parser/italian/ParserITA.java b/src/corpus_parser/italian/ParserITA.java index 730a067..db402cd 100644 --- a/src/corpus_parser/italian/ParserITA.java +++ b/src/corpus_parser/italian/ParserITA.java @@ -57,11 +57,6 @@ public void parse(String fileName){ if(WordDependency==null) WordDependency="ERROR"; if(WordDependency.length()<3) WordDependency="[0;0]"; - /*System.out.println("==========="); - System.out.println(WordID); - System.out.println(WordFeatures); - System.out.println(WordDependency); */ - WordITA w = new WordITA(WordFeatures, WordID, WordDependency); wordsMapDouble.put(WordID,w); } diff --git a/src/corpus_parser/italian/WordITA.java b/src/corpus_parser/italian/WordITA.java index 1af1f83..f1f05fa 100644 --- a/src/corpus_parser/italian/WordITA.java +++ b/src/corpus_parser/italian/WordITA.java @@ -1,5 +1,6 @@ package corpus_parser.italian; +import corpus_parser.StringHelper; import corpus_parser.Word; import java.util.HashMap; @@ -26,10 +27,10 @@ public WordITA(String _feat, double _id, String _dependency) { this.feat = _feat; this.id = _id; - this.featValues = this.feat.split(" "); + this.featValues = StringHelper.splitString(_feat, " "); this.dependency = _dependency; - this.lemma = featValues[0].substring(1, featValues[0].length()); - this.dependencyValues = this.dependency.split(";"); + this.lemma = featValues[0].substring(1, featValues[0].length());//.intern(); + this.dependencyValues = StringHelper.splitString(dependency, ";"); this.dom = Double.valueOf(dependencyValues[0].substring(1, dependencyValues[0].length())); this.link = dependencyValues[1].substring(0, dependencyValues[1].length()-1); } diff --git a/src/corpus_parser/russian/ParserRU.java b/src/corpus_parser/russian/ParserRU.java index c9059f1..2e42741 100644 --- a/src/corpus_parser/russian/ParserRU.java +++ b/src/corpus_parser/russian/ParserRU.java @@ -11,9 +11,7 @@ import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import java.io.*; -import java.util.HashMap; -import java.util.Iterator; -import java.util.Map; +import java.util.*; /** * Created with IntelliJ IDEA. @@ -23,7 +21,8 @@ */ public class ParserRU extends Parser { - public HashMap languageProperties = new HashMap(); //все св-ва языка + public HashMap languageCategoriesMeta = new HashMap(); //все св-ва языка + public List languagePropertiesValues = new ArrayList(); private HashMap sentenceMap = new HashMap(); private Document doc; @@ -56,6 +55,8 @@ public class ParserRU extends Parser { public ParserRU(String fileName, DatabaseHelper _dbhelper) { getMeta(META_FILE_NAME); + for(int i=0; i< languagePropertiesValues.size();i++) + System.out.println(languagePropertiesValues.get(i)); this.dbhelper = _dbhelper; @@ -147,7 +148,8 @@ public void parse(String fileName) { Integer.valueOf(wordElement.getAttribute(WORD_ATTR_ID)), wordElement.getAttribute(WORD_ATTR_LEMMA), link, - this.languageProperties); + this.languageCategoriesMeta, + this.languagePropertiesValues); //передаем слово и его хар-ки в db //insert word data into db this.dbhelper.insertWord( @@ -193,10 +195,10 @@ public void getStats() { if(word.id < parent.id) { String delimiter = "<"; - bigram = word.featValues[0] + delimiter + parent.featValues[0]; + bigram = word.partOfSpeech + delimiter + parent.partOfSpeech; } else { String delimiter = ">"; - bigram = parent.featValues[0] + delimiter + word.featValues[0]; + bigram = parent.partOfSpeech + delimiter + word.partOfSpeech; } if (StatsManagement.stats.containsKey(bigram)) { @@ -214,10 +216,9 @@ public void getMeta(String metaFileName){ String metaString; while ((metaString = br.readLine()) != null) { String[] metaStringSplitted = metaString.split(";"); - //обработка зарезервированных слов sql - if(metaStringSplitted[1].matches("case")) metaStringSplitted[1]="`case`"; - // - languageProperties.put(metaStringSplitted[0],metaStringSplitted[1]); + languageCategoriesMeta.put(metaStringSplitted[0], metaStringSplitted[1]); + if(!languagePropertiesValues.contains(metaStringSplitted[1])) + languagePropertiesValues.add(metaStringSplitted[1]); } } catch (FileNotFoundException e) { e.printStackTrace(); diff --git a/src/corpus_parser/russian/WordRU.java b/src/corpus_parser/russian/WordRU.java index ca1e37e..a28b53f 100644 --- a/src/corpus_parser/russian/WordRU.java +++ b/src/corpus_parser/russian/WordRU.java @@ -3,11 +3,7 @@ import corpus_parser.StringHelper; import corpus_parser.Word; -import java.io.*; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; +import java.util.*; /** * Created with IntelliJ IDEA. @@ -23,13 +19,16 @@ public class WordRU implements Word { public String lemma; public String link; public String[] featValues; + public String partOfSpeech; public List properties; public List propertiesValues = new ArrayList(); - public HashMap languageProperties; + public HashMap languageCategoriesMeta; //передается из парсера мета + public List languagePropertiesValues; public HashMap databaseFields; - public WordRU(int _dom, String _feat, int _id, String _lemma, String _link, HashMap _languageProperties) + public WordRU(int _dom, String _feat, int _id, String _lemma, String _link, + HashMap _languageCategoriesMeta, List _languagePropertiesValues) { this.dom = _dom; this.feat = _feat; @@ -37,19 +36,30 @@ public WordRU(int _dom, String _feat, int _id, String _lemma, String _link, Hash this.lemma = _lemma; this.link = _link; this.featValues = StringHelper.splitString(_feat, " "); - this.languageProperties = _languageProperties; + this.partOfSpeech = featValues[0]; + this.languageCategoriesMeta = _languageCategoriesMeta; + this.languagePropertiesValues = _languagePropertiesValues; + //заполняем все значения категорий слова NULL + getPropertiesValues(); this.properties = getProperties(_feat); + for(int i=0; i getProperties(String _feat){ String[] featValues = StringHelper.splitString(_feat, " "); - List existingProperties = new ArrayList(/*Arrays.asList("internalId","domid","lemma","link","word","partOfSpeech")*/); - for(int i = 1; i < featValues.length; i++){ - if(languageProperties.containsKey(featValues[i])){ - existingProperties.add(languageProperties.get(featValues[i])); - propertiesValues.add(featValues[i]); + List existingProperties = new ArrayList(); + for(int i = 1; i < featValues.length; i++) + if (languageCategoriesMeta.containsKey(featValues[i])) { + existingProperties.add(languageCategoriesMeta.get(featValues[i])); + //if(propertiesValues.contains(this.languagePropertiesKeys)) + propertiesValues.add(languagePropertiesValues.indexOf(languageCategoriesMeta.get(featValues[i])), + featValues[i]); + } - } + return existingProperties; } @@ -57,4 +67,13 @@ private void setDatabaseFields(){ databaseFields.put(this.id,"internalId"); } + public void getPropertiesValues(){ + //создаем лист соответствий свойств языка и всех свойств, + //если какое-то св-во присутствует то на это место в листе + //становится значение, иначе 0 + for(int i=0;i