package us.deans.zinc;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Pattern;
import java.io.*;

import org.apache.log4j.Logger;

import us.deans.panther.ppModel;

public class ZnModel extends ppModel {

    // Log4j
    private Logger logger = Logger.getLogger("ZincModel");

    // Collections
    private ArrayList<ZnJobRecord> JobList;  // list of job records
    private ZnTermList IncludedTermList;     // list of included term records
    private ZnTermList ExcludedTermList;     // list of excluded term records
    private ZnTermList ExtractedTermList;    // list of term records extracted from the JobList

    private String rrUrl;
    private Properties props;

    // Iterators
    private Iterator<ZnTermRecord> itrTerms;
    private Iterator<ZnTermRecord> itrExTerms;
    private Iterator<ZnJobRecord> itrJobs;

    // Record Classes
    private ZnTermRecord recTerm;
    private ZnJobRecord recJob;

    // Data Class
    private ZnDbAdapter dataAdapter;

    // Compiled Patterns
    private Pattern regx;

    // Constructor
    public ZnModel(Properties props) {
        this.props = props;
        JobList = new ArrayList<ZnJobRecord>();
        IncludedTermList = new ZnTermList();
        ExcludedTermList = new ZnTermList();
        ExtractedTermList = new ZnTermList();
        logger.debug("model is loaded...");
    }

    // Public Methods

    public void importRssData() {
        JobList.clear();
        ZnRssReader rssReader = new ZnRssReader();
        // TODO: add a controlled loop here so that any number of addresses and
        // descriptions can be read in from the properties file (see the sketch
        // after this method).
        rrUrl = props.getProperty("uri01");
        rssReader.readRss(JobList, rrUrl, props.getProperty("met01"));
        rrUrl = props.getProperty("uri02");
        rssReader.readRss(JobList, rrUrl, props.getProperty("met02"));
    }
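
    // A minimal sketch of the controlled loop noted in the TODO above,
    // assuming the feeds are numbered uri01/met01, uri02/met02, ... in the
    // properties file. The helper name importAllRssFeeds and the zero-padded
    // key format are assumptions, not part of the original class.
    private void importAllRssFeeds(ZnRssReader rssReader) {
        int n = 1;
        String url = props.getProperty(String.format("uri%02d", n));
        while (url != null) {
            // each uriNN is paired with a metNN description, as uri01/met01 are above
            rssReader.readRss(JobList, url, props.getProperty(String.format("met%02d", n)));
            n++;
            url = props.getProperty(String.format("uri%02d", n));
        }
    }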
    public ArrayList<ZnJobRecord> getRssData() {
        return JobList;
    }

    /** Public call to generate the term list from the job descriptions. */
    public void extractTerms() {
        if (JobList.isEmpty()) {
            logger.debug("No input data.");
            return;
        }
        ExtractedTermList.clear();
        readExcludedTerms(); // read excluded terms file -> ExcludedTermList
        readIncludedTerms(); // read included terms file -> IncludedTermList
        ProcessTermList();   // extract terms from job records, filter with excluded terms
        // writeExtractedTerms();
        // readTermDataAsSimpleText();
        writeTermDataAsSimpleText();
    }

    public ZnTermList getInTermData() {
        return IncludedTermList;
    }

    public void exportJobDataToDb() {
        dataAdapter = new ZnDbAdapter(props, logger);
        dataAdapter.exportJobData(JobList);
    }

    public void importJobDataFromDb() {
        dataAdapter = new ZnDbAdapter(props, logger);
    }

    /**
     * For each job record in the job list, compares the words in the job
     * description field to the words in the term lists and writes the matches
     * to the keyword field.
     */
    public void processJobList() {
        logger.info("Processing job records...");
        String jobDescription = "";
        String jobKeywords = "";
        String[] buffer;
        Set<String> TermSet = new HashSet<String>(); // hash set used for word matching
        regx = Pattern.compile("\\W");               // compile regular expression
        String inTermFile = props.getProperty("inTermFile");

        if (JobList.isEmpty()) {
            logger.debug("No input data.");
            return;
        }
        readExcludedTerms();
        readIncludedTerms();
        if (IncludedTermList.isEmpty()) {
            logger.debug("No included terms");
            return;
        }

        // load the hash set with the previously included terms
        itrTerms = IncludedTermList.iterator();
        while (itrTerms.hasNext()) {
            String strTerm = itrTerms.next().getTermItem();
            if (!TermSet.add(strTerm))
                logger.warn("Duplicate detected in included terms - check " + inTermFile + " file.");
        }

        // loop through the job records in the job list
        itrJobs = JobList.iterator();
        while (itrJobs.hasNext()) {
            recJob = itrJobs.next();
            jobDescription = recJob.getJobDescription();
            buffer = regx.split(jobDescription);
            // collect each word of the description that matches an included term
            for (int i = 0; i < buffer.length; i++) {
                if (TermSet.contains(buffer[i]))
                    jobKeywords = jobKeywords + buffer[i] + ",";
            }
            int len = jobKeywords.length();
            if (len > 1) jobKeywords = jobKeywords.substring(0, len - 1); // drop the trailing separator
            recJob.setJobKeywords(jobKeywords);
            jobKeywords = "";
            logger.debug("created keywords: " + recJob.getJobKeywords());
        }
    }

    // Private Methods

    /**
     * Reads in each job description, extracts the distinct words and adds
     * those words to the candidate (extracted) term list.
     */
    private void ProcessTermList() {
        logger.debug("Extracting terms from job records...");
        String exTermFile = props.getProperty("exTermFile");
        String inTermFile = props.getProperty("inTermFile");
        int i = 0;
        int dupCount = 0;
        regx = Pattern.compile("(\\W|[...])"); // compile regx - [...] doesn't seem to work.
        Set<String> TermSet = new HashSet<String>(); // the hash set ensures that only unique words are included

        // load the hash set with the previously excluded terms
        itrExTerms = ExcludedTermList.iterator();
        while (itrExTerms.hasNext()) {
            String strTerm = itrExTerms.next().getTermItem();
            if (!TermSet.add(strTerm))
                logger.warn("Duplicate detected in excluded terms - check " + exTermFile + " file.");
        }

        // load the hash set with the previously included terms - also add the
        // previously included terms to the new extracted term list.
        itrTerms = IncludedTermList.iterator();
        while (itrTerms.hasNext()) {
            String strTerm = itrTerms.next().getTermItem();
            if (!TermSet.add(strTerm))
                logger.warn("Duplicate detected in included terms - check " + inTermFile + " file.");
            else {
                recTerm = new ZnTermRecord();
                recTerm.setTermItem(strTerm);
                recTerm.setTermCategory("extracted");
                recTerm.setTermCount(1);
                ExtractedTermList.add(recTerm); // add new term to term list
                logger.debug("adding previously included term: " + strTerm);
            }
        }

        // add terms newly extracted from the description fields of the job list to the hash set
        itrJobs = JobList.iterator();
        while (itrJobs.hasNext()) { // cycle through the job list
            String jobDescription = itrJobs.next().getJobDescription().trim(); // get job description string (subject)
            String[] tokens = regx.split(jobDescription); // parse subject
            for (i = 0; i < tokens.length; i++) {
                if (tokens[i].length() > 0) {
                    if (!TermSet.add(tokens[i])) {
                        logger.debug("Duplicate term detected : " + tokens[i]);
                        dupCount++;
                    } else {
                        recTerm = new ZnTermRecord();
                        recTerm.setTermItem(tokens[i]);
                        recTerm.setTermCategory("extracted");
                        recTerm.setTermCount(1);
                        ExtractedTermList.add(recTerm); // add new term to term list
                        logger.debug("adding extracted term: " + tokens[i]);
                    }
                }
            } // end of subject
        } // end of subject list
        logger.info(dupCount + " duplicates detected. " + TermSet.size() + " distinct words collected.");
    } // end of process

    /** Reads a list of excluded terms from a simple text file. */
    private void readExcludedTerms() {
        String exTermFile = props.getProperty("exTermFile");
        StringBuffer buffer = new StringBuffer();
        int ch;
        try {
            FileInputStream fis = new FileInputStream(exTermFile);
            InputStreamReader reader = new InputStreamReader(fis);
            Reader in = new BufferedReader(reader);
            // read file into string buffer
            while ((ch = in.read()) > -1) {
                buffer.append((char) ch);
            }
            in.close();
            // break string into terms
            String[] terms = buffer.toString().split("\\n");
            // store terms in term list
            for (int i = 0; i < terms.length; i++) {
                recTerm = new ZnTermRecord();
                recTerm.setTermItem(terms[i].trim());
                recTerm.setTermCategory("excluded");
                ExcludedTermList.add(recTerm);
            }
        } catch (IOException e) {
            logger.error("Could not read excluded terms file: " + exTermFile, e);
        }
    }

    /** Reads a list of included terms from a simple text file. */
    private void readIncludedTerms() {
        String inTermFile = props.getProperty("inTermFile");
        StringBuffer buffer = new StringBuffer();
        int ch;
        try {
            FileInputStream fis = new FileInputStream(inTermFile);
            InputStreamReader reader = new InputStreamReader(fis);
            Reader in = new BufferedReader(reader);
            // read file into string buffer
            while ((ch = in.read()) > -1) {
                buffer.append((char) ch);
            }
            in.close();
            // break string into terms
            String[] terms = buffer.toString().split("\\n");
            // store terms in term list
            for (int i = 0; i < terms.length; i++) {
                recTerm = new ZnTermRecord();
                recTerm.setTermItem(terms[i].trim());
                recTerm.setTermCategory("included");
                IncludedTermList.add(recTerm);
            }
        } catch (IOException e) {
            logger.error("Could not read included terms file: " + inTermFile, e);
        }
    }

    /** Reads previously saved term data from a simple text file. */
    private void readTermDataAsSimpleText() {
        String termFile = props.getProperty("termFile"); // property name assumed
        StringBuffer buffer = new StringBuffer();
        int ch;
        try {
            FileInputStream fis = new FileInputStream(termFile);
            InputStreamReader reader = new InputStreamReader(fis);
            Reader in = new BufferedReader(reader);
            // read file into string buffer
            while ((ch = in.read()) > -1) {
                buffer.append((char) ch);
            }
            in.close();
            // break string into terms
            String[] terms = buffer.toString().split("\\n");
            // store terms in term list
            for (int i = 0; i < terms.length; i++) {
                recTerm = new ZnTermRecord();
                recTerm.setTermItem(terms[i].trim());
                recTerm.setTermCategory("extracted"); // target list and category assumed
                ExtractedTermList.add(recTerm);
            }
        } catch (IOException e) {
            logger.error("Could not read term data file: " + termFile, e);
        }
    }
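
    // A minimal sketch of the writer that extractTerms() calls, assuming the
    // extracted terms are written back one per line (matching the split("\\n")
    // in the readers) to the file named by the hypothetical "termFile"
    // property used by readTermDataAsSimpleText() above.
    private void writeTermDataAsSimpleText() {
        String termFile = props.getProperty("termFile"); // assumed property name
        try {
            Writer out = new BufferedWriter(new FileWriter(termFile));
            itrTerms = ExtractedTermList.iterator();
            while (itrTerms.hasNext()) {
                out.write(itrTerms.next().getTermItem());
                out.write("\n"); // one term per line
            }
            out.close();
        } catch (IOException e) {
            logger.error("Could not write term data file: " + termFile, e);
        }
    }
}

// A minimal usage sketch of the ZnModel API defined above. The standalone
// driver class and the "zinc.properties" file name are assumptions; in the
// running application the model is presumably driven by a Panther controller.
class ZnModelUsageExample {
    public static void main(String[] args) throws IOException {
        Properties props = new Properties();
        props.load(new FileInputStream("zinc.properties")); // assumed file name
        ZnModel model = new ZnModel(props);
        model.importRssData();     // pull job records from the configured RSS feeds
        model.extractTerms();      // build and save the candidate term list
        model.processJobList();    // tag each job record with matching keywords
        model.exportJobDataToDb(); // persist the tagged records
    }
}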