IData/src/Dataset.cpp

Go to the documentation of this file.
00001 //--------------------------------------------------------------------------
00002 // File and Version Information:
00003 //      $Id: Dataset.cpp 10652 2015-09-09 20:12:56Z davidsch@SLAC.STANFORD.EDU $
00004 //
00005 // Description:
00006 //      Class Dataset...
00007 //
00008 // Author List:
00009 //      Andy Salnikov
00010 //
00011 //------------------------------------------------------------------------
00012 
00013 //-----------------------
00014 // This Class's Header --
00015 //-----------------------
00016 #include "IData/Dataset.h"
00017 
00018 //-----------------
00019 // C/C++ Headers --
00020 //-----------------
00021 #include <cctype>
00022 #include <boost/lexical_cast.hpp>
00023 #include <boost/algorithm/string.hpp>
00024 #include <boost/filesystem.hpp>
00025 #include <boost/regex.hpp>
00026 #include <boost/format.hpp>
00027 
00028 //-------------------------------
00029 // Collaborating Class Headers --
00030 //-------------------------------
00031 #include "ExpNameDb/ExpNameDatabase.h"
00032 #include "IData/Exceptions.h"
00033 #include "MsgLogger/MsgLogger.h"
00034 
00035 //-----------------------------------------------------------------------
00036 // Local Macros, Typedefs, Structures, Unions and Forward Declarations --
00037 //-----------------------------------------------------------------------
00038 
00039 namespace fs = boost::filesystem;
00040 
00041 namespace {
00042 
00043   const char* logger = "IData.Dataset";
00044 
00045   // parse experiment name or id, returns true on success
00046   bool parseExpName(const std::string& exp, unsigned& expId, std::string& instrName, std::string& expName);
00047 
00048   // parse run list
00049   void parseRuns(const std::string& str, IData::Dataset::Runs& runs);
00050 
00051   // parse stream list
00052   void parseStreams(const std::string& str, IData::Dataset::Streams& streams);
00053  
00054   // checks to see if the string is a file name
00055   bool isFileName(const std::string& str);
00056 
00057   // --------- begin helper functions for files() method  -------
00058 
00059   // returns set of all values in list of pairs, for stream and run specification
00060   template <class T>
00061   std::set<unsigned> listOfPairsAsSet(const T &listOfPairs) {
00062     std::set<unsigned> asSet;
00063     for (typename T::const_iterator pairIterator = listOfPairs.begin();
00064          pairIterator != listOfPairs.end(); ++pairIterator) {
00065       unsigned a = pairIterator->first;
00066       unsigned b = pairIterator->second;
00067       for (unsigned curVal = a; curVal <= b; ++curVal) {
00068         asSet.insert(curVal);
00069       }
00070     }
00071     return asSet;
00072   }
00073 
00074   /// return regular expression for the basename of a datafile, either hdf5 or xtc.
00075   /// For hdf5, match the run.
00076   /// for xtc, match both the run and stream.
00077   boost::regex constructDataFileRegEx(bool hdf5, bool smd, const std::string &experiment, unsigned expID) {    
00078     std::string reStr;
00079     if (hdf5) {
00080       reStr = boost::str(boost::format("%1%-r0*(\\d+)(-.*)?[.]h5") % experiment);
00081     } else if (smd) {
00082       reStr = boost::str(boost::format("e%1%-r0*(\\d+)-s([0-9]+)-c[0-9]+[.]smd[.]xtc") % expID);
00083     } else {
00084       reStr = boost::str(boost::format("e%1%-r0*(\\d+)-s([0-9]+)-c[0-9]+[.]xtc") % expID);    
00085     }
00086     boost::regex re(reStr);
00087     return re;
00088   }
00089 
00090   // --------- end helper functions for files() method  -------
00091 
00092   template <class T>
00093   std::vector<unsigned> listOfPairsAsVector(const T &listOfPairs) {
00094     std::vector<unsigned> asVector;
00095     for (typename T::const_iterator pairIterator = listOfPairs.begin();
00096          pairIterator != listOfPairs.end(); ++pairIterator) {
00097       unsigned a = pairIterator->first;
00098       unsigned b = pairIterator->second;
00099       for (unsigned curVal = a; curVal <= b; ++curVal) {
00100         asVector.push_back(curVal);
00101       }
00102     }
00103     return asVector;
00104   }
00105 }
00106 
00107 //              ----------------------------------------
00108 //              -- Public Function Member Definitions --
00109 //              ----------------------------------------
00110 
00111 namespace IData {
00112 
00113 // static data members
00114 Dataset::Key2Val Dataset::s_key2val;      ///< Application-wide options
00115 unsigned Dataset::s_expId(0);             ///< Application-wide experiment ID
00116 std::string Dataset::s_instrName;         ///< Application-wide instrument name
00117 std::string Dataset::s_expName;           ///< Application-wide experiment name
00118 
00119 
00120 /**
00121  *  @brief Sets application-wide experiment name.
00122  *
00123  *  Experiment name can be specified with the syntax acceptable for exp key.
00124  *  Individual datasets can override application-wide value.
00125  *
00126  *  @param[in] exp  new application-wide experiment name
00127  *
00128  *  @throw ExpNameException thrown if specified name is not found
00129  */
00130 void
00131 Dataset::setAppExpName(const std::string& exp)
00132 {
00133   if (not ::parseExpName(exp, s_expId, s_instrName, s_expName)) {
00134     throw IData::ExpNameException(ERR_LOC, exp);
00135   }
00136   s_key2val["exp"] = exp;
00137 }
00138 
00139 /**
00140  *  @brief Sets default application-wide option.
00141  *
00142  *  Sets default application-wide value for an option.
00143  *  Individual datasets can override application-wide values.
00144  *
00145  *  @param[in] key   Key name
00146  *  @param[in] value New application-wide value for this key
00147  */
00148 void
00149 Dataset::setDefOption(const std::string& key, const std::string& value)
00150 {
00151   if (key == "exp") {
00152     if (not ::parseExpName(value, s_expId, s_instrName, s_expName)) {
00153       throw IData::ExpNameException(ERR_LOC, value);
00154     }
00155   } else if (key == "run") {
00156     MsgLog(logger, warning, "setDefOption() does not accept run numbers");
00157   }
00158   s_key2val[key] = value;
00159 }
00160 
00161 //----------------
00162 // Constructors --
00163 //----------------
00164 Dataset::Dataset()
00165   : m_isFile(false)
00166   , m_key2val()
00167   , m_runs()
00168   , m_expId(0)
00169   , m_instrName()
00170   , m_expName()
00171   , m_files()
00172 {
00173 }
00174 
00175 Dataset::Dataset(const std::string& ds)
00176   : m_isFile(false)
00177   , m_key2val()
00178   , m_runs()
00179   , m_expId(0)
00180   , m_instrName()
00181   , m_expName()
00182   , m_files()
00183 {
00184   if (::isFileName(ds)) {
00185  
00186     // parse file names with good extensions
00187     if (boost::ends_with(ds, ".xtc")) {
00188       parseXtcFileName(ds);
00189     } else if (boost::ends_with(ds, ".h5")) {
00190       parseHdfFileName(ds);
00191     }
00192     
00193     // store the file name
00194     m_files.push_back(ds);
00195  
00196     m_isFile = true;
00197     
00198   } else {
00199 
00200     // must be a dataset, split it at colons
00201     std::vector<std::string> options;
00202     boost::split(options, ds, boost::is_any_of(":"), boost::token_compress_on);
00203     for (std::vector<std::string>::const_iterator it = options.begin(); it != options.end(); ++ it) {
00204   
00205       std::string option = *it;
00206       boost::trim(option);
00207       if (option.empty()) continue;
00208   
00209       std::string key(option);
00210       std::string val;
00211   
00212       std::string::size_type p = option.find('=');
00213       if (p != std::string::npos) {
00214         key.erase(p);
00215         boost::trim(key);
00216         val = option.substr(p+1);
00217         boost::trim(val);
00218       }
00219   
00220       if (key == "exp") {
00221         if (not ::parseExpName(val, m_expId, m_instrName, m_expName)) {
00222           throw IData::ExpNameException(ERR_LOC, val);
00223         }
00224       } else if (key == "run") {
00225         ::parseRuns(val, m_runs);
00226       } else if (key == "stream") {
00227         ::parseStreams(val, m_streams);
00228       }
00229       m_key2val[key] = val;
00230 
00231     }
00232   }
00233 
00234 }
00235 
00236 //--------------
00237 // Destructor --
00238 //--------------
00239 Dataset::~Dataset()
00240 {
00241 }
00242 
00243 /**
00244  *  @brief Returns true if the key is defined.
00245  *
00246  *  Key may be defined by either constructor or with a default
00247  *  application-wide option.
00248  *
00249  *  @param[in] key  Key name
00250  */
00251 bool
00252 Dataset::exists(const std::string& key) const
00253 {
00254   return m_key2val.find(key) != m_key2val.end() or
00255       s_key2val.find(key) != s_key2val.end();
00256 }
00257 
00258 /**
00259  *  @brief Returns value for given key or empty string.
00260  *
00261  *  @param[in] key  Key name
00262  */
00263 const std::string&
00264 Dataset::value(const std::string& key) const
00265 {
00266   // check my keys first
00267   Key2Val::const_iterator it = m_key2val.find(key);
00268   if (it != m_key2val.end()) return it->second;
00269 
00270   // check default keys
00271   it = s_key2val.find(key);
00272   if (it != s_key2val.end()) return it->second;
00273 
00274   // otherwise return empty string
00275   static std::string def;
00276   return def;
00277 }
00278 
00279 /// Returns experiment ID or 0 if it has not been defined.
00280 unsigned
00281 Dataset::expID() const
00282 {
00283   if (m_expId == 0) return s_expId;
00284   return m_expId;
00285 }
00286 
00287 /// Returns instrument name or empty string if it has not been defined.
00288 const std::string&
00289 Dataset::instrument() const
00290 {
00291   if (m_instrName.empty()) return s_instrName;
00292   return m_instrName;
00293 }
00294 
00295 /// Returns experiment name or empty string if it has not been defined.
00296 const std::string&
00297 Dataset::experiment() const
00298 {
00299   if (m_expName.empty()) return s_expName;
00300   return m_expName;
00301 }
00302 
00303 /// Returns set of run numbers
00304 const Dataset::Runs&
00305 Dataset::runs() const
00306 {
00307   return m_runs;
00308 }
00309 
00310 /// Returns set of stream ranges
00311 const Dataset::Streams&
00312 Dataset::streams() const
00313 {
00314   return m_streams;
00315 }
00316 
00317 /// Returns set of run numbers
00318 std::vector<unsigned> 
00319 Dataset::runsAsList() const
00320 {
00321   return listOfPairsAsVector(m_runs);
00322 }
00323 
00324 /// Returns set of stream ranges
00325 std::vector<unsigned>
00326 Dataset::streamsAsList() const
00327 {
00328   return listOfPairsAsVector(m_streams);
00329 }
00330 
00331 /// Return the directory name for files
00332 std::string 
00333 Dataset::dirName() const
00334 {
00335   // get directory name where to look for files
00336   std::string dir = this->value("dir");
00337   if (dir.empty()) {
00338     const char* type = this->exists("h5") ? "hdf5" : "xtc";
00339     boost::format fmt("%1%/%2%/%3%/%4%");
00340     const char* datadir = getenv("SIT_PSDM_DATA");
00341     if (datadir) {
00342       fmt % datadir % instrument() % experiment() % type;
00343       dir = fmt.str();
00344     } else {
00345       fmt % "/reg/d/psdm" % instrument() % experiment() % type;
00346       dir = fmt.str();
00347     }
00348   }
00349   return dir;
00350 }
00351 
00352 /// Return the list of file names for this dataset
00353 const Dataset::NameList& 
00354 Dataset::files() const
00355 {
00356   if (not m_files.empty()) return m_files;
00357 
00358   bool hdf5 = this->exists("h5");
00359   bool smd = this->exists("smd");
00360 
00361   // get directory name where to look for files
00362   std::string dir = this->dirName();
00363   if (smd) {
00364     dir += "/smalldata";
00365   }
00366   if (not fs::is_directory(dir)) {
00367     throw DatasetDirError(ERR_LOC, dir);
00368   }
00369 
00370   std::vector<unsigned> runsList = runsAsList();
00371   std::vector<unsigned> streamsList = streamsAsList();
00372   std::set<unsigned> runs(runsList.begin(), runsList.end());
00373   std::set<unsigned> streams(streamsList.begin(), streamsList.end());
00374 
00375   if (hdf5 and (streams.size()>0)) {
00376     MsgLog(logger, warning, "Stream specification ignored for matching hdf5 files");
00377   }
00378 
00379   // scan all files in directory, find matching ones
00380   std::map<unsigned, unsigned> filesPerRun;
00381   for (fs::directory_iterator fiter(dir); fiter != fs::directory_iterator(); ++ fiter) {
00382 
00383     const fs::path& path = fiter->path();
00384     const fs::path& basename = path.filename();
00385     
00386     boost::regex re = constructDataFileRegEx(hdf5, smd, experiment(), expID());
00387 
00388     // name should match and we only take regular files
00389     if (boost::regex_match(basename.string(), re) and fiter->status().type() == fs::regular_file) {
00390       boost::smatch what;
00391       boost::regex_search(basename.string(), what, re);
00392       unsigned run = boost::lexical_cast<unsigned>(std::string(what[1]));
00393       if (runs.find(run) != runs.end()) {
00394         bool streamMatch = true;
00395         if ((streams.size()>0) and (not hdf5)) {
00396           unsigned stream = boost::lexical_cast<unsigned>(std::string(what[2]));
00397           if (streams.find(stream) == streams.end()) {
00398             streamMatch = false;
00399           }
00400         }
00401         if (streamMatch) {
00402           MsgLog(logger, trace, "found matching file: " << path);
00403           m_files.push_back(path.string());
00404           ++ filesPerRun[run];
00405         }
00406       }
00407     }
00408   }
00409 
00410   // Check file count per run, issue warning for runs without files
00411   for (IData::Dataset::Runs::const_iterator ritr = m_runs.begin(); ritr != m_runs.end(); ++ ritr) {
00412     // only check runs specified explicitly, not ranges
00413     if (ritr->first == ritr->second) {
00414       if (filesPerRun[ritr->first] == 0) {
00415         MsgLog(logger, warning, "no input files found for run #" << ritr->first);
00416       }
00417     }
00418   }
00419 
00420   return m_files;
00421 }
00422 
00423 void 
00424 Dataset::parseXtcFileName(std::string path)
00425 {
00426   m_key2val["xtc"];
00427 
00428   // leave only basename
00429   std::string::size_type p = path.rfind('/');
00430   if (p != std::string::npos) path.erase(0, p+1);
00431   
00432   // drop extension
00433   p = path.rfind('.');
00434   if (p != std::string::npos) path.erase(p);
00435 
00436   // split into parts
00437   std::vector<std::string> parts;
00438   boost::split(parts, path, boost::is_any_of("-"), boost::token_compress_on);
00439   
00440   // need at least 2 pieces - experiment and run number
00441   if (parts.size() < 2) return;
00442     
00443   // first part is expected to be experiment id in format eNNNN
00444   if (parts[0].empty() or parts[0][0] != 'e') return;
00445   std::string expid(parts[0], 1);
00446 
00447   // must be all digits, and at least one digit
00448   if (expid.empty() or not boost::all(expid, boost::is_digit())) return;
00449 
00450   // second part is expected to be run number in format rNNNN
00451   if (parts[1].empty() or parts[1][0] != 'r') return;
00452   std::string run(parts[1], 1);
00453 
00454   // must be all digits, and at least one digit
00455   if (run.empty() or not boost::all(run, boost::is_digit())) return;
00456 
00457   // parse and store these
00458   if (not ::parseExpName(expid, m_expId, m_instrName, m_expName)) {
00459     MsgLog(logger, warning, "unrecognized experiment ID: " << expid);
00460   }
00461   ::parseRuns(run, m_runs);
00462 }
00463 
00464 
00465 void 
00466 Dataset::parseHdfFileName(std::string path)
00467 {
00468   m_key2val["h5"];
00469 
00470   // leave only basename
00471   std::string::size_type p = path.rfind('/');
00472   if (p != std::string::npos) path.erase(0, p+1);
00473   
00474   // drop extension
00475   p = path.rfind('.');
00476   if (p != std::string::npos) path.erase(p);
00477 
00478   // split into parts
00479   std::vector<std::string> parts;
00480   boost::split(parts, path, boost::is_any_of("-"), boost::token_compress_on);
00481   
00482   // need at least 2 pieces - experiment and run number
00483   if (parts.size() < 2) return;
00484     
00485   // first part is expected to be experiment name
00486   if (parts[0].empty()) return;
00487   std::string expname(parts[0]);
00488 
00489   // second part is expected to be run number in format rNNNN
00490   if (parts[1].empty() or parts[1][0] != 'r') return;
00491   std::string run(parts[1], 1);
00492 
00493   // must be all digits, and at least one digit
00494   if (run.empty() or not boost::all(run, boost::is_digit())) return;
00495 
00496   // parse and store these
00497   if (not ::parseExpName(expname, m_expId, m_instrName, m_expName)) {
00498     MsgLog(logger, warning, "unrecognized experiment name: " << expname);
00499   }
00500   ::parseRuns(run, m_runs);
00501 }
00502 
00503 } // namespace IData
00504 
00505 namespace {
00506 
00507 
00508 // parse experiment name
00509 bool parseExpName(const std::string& exp, unsigned& expId, std::string& instrName, std::string& expName)
00510 {
00511   ExpNameDb::ExpNameDatabase namedb;
00512 
00513   if (boost::all(exp, boost::is_digit())) {
00514 
00515     // all digits, must be experiment id
00516     unsigned num = boost::lexical_cast<unsigned>(exp);
00517     std::pair<std::string, std::string> instrExp = namedb.getNames(num);
00518     if (instrExp.first.empty()) {
00519       return false;
00520     }
00521 
00522     expId = num;
00523     instrName = instrExp.first;
00524     expName = instrExp.second;
00525 
00526   } else {
00527 
00528     // experiment name optionally with instrument name
00529     std::string::size_type p = exp.find('/');
00530     if (p == std::string::npos) {
00531 
00532       // only experiment name is given
00533       std::pair<std::string, unsigned> instrExp = namedb.getInstrumentAndID(exp);
00534       if (instrExp.first.empty()) {
00535         return false;
00536       }
00537 
00538       expId = instrExp.second;
00539       instrName = instrExp.first;
00540       expName = exp;
00541 
00542     } else {
00543 
00544       // both instrument and experiment name is given
00545       const std::string instrument(exp, 0, p);
00546       const std::string experiment(exp, p+1);
00547 
00548       unsigned num = namedb.getID(instrument, experiment);
00549       if (num == 0) {
00550         return false;
00551       }
00552 
00553       expId = num;
00554       instrName = instrument;
00555       expName = experiment;
00556 
00557     }
00558 
00559 
00560   }
00561 
00562   return true;
00563 }
00564 
00565 // parse run list
00566 void parseRuns(const std::string& str, IData::Dataset::Runs& runs)
00567 {
00568   runs.clear();
00569 
00570   // split it at commas
00571   std::vector<std::string> ranges;
00572   boost::split(ranges, str, boost::is_any_of(","), boost::token_compress_on);
00573   for (std::vector<std::string>::const_iterator it = ranges.begin(); it != ranges.end(); ++ it) {
00574 
00575     std::string range = *it;
00576     boost::trim(range);
00577     if (range.empty()) continue;
00578 
00579     std::string startStr(range);
00580     std::string endStr;
00581 
00582     std::string::size_type p = range.find('-');
00583     if (p != std::string::npos) {
00584       startStr.erase(p);
00585       boost::trim(startStr);
00586       endStr = range.substr(p+1);
00587       boost::trim(endStr);
00588     }
00589 
00590     unsigned start, end;
00591     try {
00592       start = boost::lexical_cast<unsigned>(startStr);
00593       if (endStr.empty()) {
00594         end = start;
00595       } else {
00596         end = boost::lexical_cast<unsigned>(endStr);
00597       }
00598     } catch (const boost::bad_lexical_cast& ex) {
00599       throw IData::RunNumberSpecException(ERR_LOC, str, ex.what());
00600     }
00601 
00602     runs.push_back(IData::Dataset::Runs::value_type(start, end));
00603 
00604   }
00605 
00606 }
00607 
00608 // parse stream list
00609 void parseStreams(const std::string& str, IData::Dataset::Streams& streams)
00610 {
00611   streams.clear();
00612 
00613   // split it at commas
00614   std::vector<std::string> ranges;
00615   boost::split(ranges, str, boost::is_any_of(","), boost::token_compress_on);
00616   for (std::vector<std::string>::const_iterator it = ranges.begin(); it != ranges.end(); ++ it) {
00617 
00618     std::string range = *it;
00619     boost::trim(range);
00620     if (range.empty()) continue;
00621 
00622     std::string startStr(range);
00623     std::string endStr;
00624 
00625     std::string::size_type p = range.find('-');
00626     if (p != std::string::npos) {
00627       startStr.erase(p);
00628       boost::trim(startStr);
00629       endStr = range.substr(p+1);
00630       boost::trim(endStr);
00631     }
00632 
00633     unsigned start, end;
00634     try {
00635       start = boost::lexical_cast<unsigned>(startStr);
00636       if (endStr.empty()) {
00637         end = start;
00638       } else {
00639         end = boost::lexical_cast<unsigned>(endStr);
00640       }
00641     } catch (const boost::bad_lexical_cast& ex) {
00642       throw IData::StreamRangeSpecException(ERR_LOC, str, ex.what());
00643     }
00644     if (end < start) {
00645       throw IData::StreamRangeSpecException(ERR_LOC, str, "the first number in the range must be less or equal to the last one");
00646     }
00647 
00648     streams.push_back(IData::Dataset::Streams::value_type(start, end));
00649 
00650   }
00651 
00652 }
00653 
00654 
// Checks whether the string looks like a file name rather than a dataset
// specification.
//
// Without colons: a string containing '=' is an option; otherwise it is a
// file name only if it contains a dot or a slash.
// With colons: it is a file name only if every colon is followed by a slash
// or a digit (URL-type names are expected to be supported in the future).
bool
isFileName(const std::string& str)
{
  std::string::size_type col = str.find(':');
  if (col == std::string::npos) {

    // no colons but an equal sign - should be an option
    if (str.find('=') != std::string::npos) return false;

    // no colons and either dots or slashes - must be a file,
    // no colons and no dots, no slashes - assume it's an option
    return str.find_first_of("./") != std::string::npos;
  }

  for (; col != std::string::npos; col = str.find(':', col+1)) {

    // last character is a colon, cannot be a file name
    if (col + 1 == str.size()) return false;

    // std::isdigit must not be given a negative value, so go through
    // unsigned char for bytes outside the ASCII range
    const unsigned char next = static_cast<unsigned char>(str[col+1]);

    // colon followed by something other than / or digit, not a file
    if (next != '/' and not std::isdigit(next)) return false;
  }

  return true;
}
00693 
00694 }

Generated on 19 Dec 2016 for PSANAclasses by  doxygen 1.4.7