00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016 #include "IData/Dataset.h"
00017
00018
00019
00020
00021 #include <cctype>
00022 #include <boost/lexical_cast.hpp>
00023 #include <boost/algorithm/string.hpp>
00024 #include <boost/filesystem.hpp>
00025 #include <boost/regex.hpp>
00026 #include <boost/format.hpp>
00027
00028
00029
00030
00031 #include "ExpNameDb/ExpNameDatabase.h"
00032 #include "IData/Exceptions.h"
00033 #include "MsgLogger/MsgLogger.h"
00034
00035
00036
00037
00038
00039 namespace fs = boost::filesystem;
00040
00041 namespace {
00042
00043 const char* logger = "IData.Dataset";
00044
00045
00046 bool parseExpName(const std::string& exp, unsigned& expId, std::string& instrName, std::string& expName);
00047
00048
00049 void parseRuns(const std::string& str, IData::Dataset::Runs& runs);
00050
00051
00052 void parseStreams(const std::string& str, IData::Dataset::Streams& streams);
00053
00054
00055 bool isFileName(const std::string& str);
00056
00057
00058
00059
// Expands a sequence of inclusive (first, second) ranges into the set of all
// values covered by at least one range.
//
// T is any container whose const_iterator points at objects with `first` and
// `second` members convertible to unsigned. A range whose first value is
// greater than its second contributes nothing.
template <class T>
std::set<unsigned> listOfPairsAsSet(const T &listOfPairs) {
  std::set<unsigned> asSet;
  for (typename T::const_iterator rangeIt = listOfPairs.begin();
       rangeIt != listOfPairs.end(); ++rangeIt) {
    const unsigned first = rangeIt->first;
    const unsigned last = rangeIt->second;
    if (first > last) continue;
    // Stop on equality rather than testing `cur <= last`: when last is the
    // largest unsigned value, ++cur wraps around to 0 and a `<=` condition
    // never becomes false.
    for (unsigned cur = first; ; ++cur) {
      asSet.insert(cur);
      if (cur == last) break;
    }
  }
  return asSet;
}
00073
00074
00075
00076
00077 boost::regex constructDataFileRegEx(bool hdf5, bool smd, const std::string &experiment, unsigned expID) {
00078 std::string reStr;
00079 if (hdf5) {
00080 reStr = boost::str(boost::format("%1%-r0*(\\d+)(-.*)?[.]h5") % experiment);
00081 } else if (smd) {
00082 reStr = boost::str(boost::format("e%1%-r0*(\\d+)-s([0-9]+)-c[0-9]+[.]smd[.]xtc") % expID);
00083 } else {
00084 reStr = boost::str(boost::format("e%1%-r0*(\\d+)-s([0-9]+)-c[0-9]+[.]xtc") % expID);
00085 }
00086 boost::regex re(reStr);
00087 return re;
00088 }
00089
00090
00091
// Expands a sequence of inclusive (first, second) ranges into a flat list of
// values, in the order the ranges appear. Overlapping ranges produce repeated
// values.
//
// T is any container whose const_iterator points at objects with `first` and
// `second` members convertible to unsigned. A range whose first value is
// greater than its second contributes nothing.
template <class T>
std::vector<unsigned> listOfPairsAsVector(const T &listOfPairs) {
  std::vector<unsigned> asVector;
  for (typename T::const_iterator rangeIt = listOfPairs.begin();
       rangeIt != listOfPairs.end(); ++rangeIt) {
    const unsigned first = rangeIt->first;
    const unsigned last = rangeIt->second;
    if (first > last) continue;
    // Stop on equality rather than testing `cur <= last`: when last is the
    // largest unsigned value, ++cur wraps around to 0 and a `<=` condition
    // never becomes false.
    for (unsigned cur = first; ; ++cur) {
      asVector.push_back(cur);
      if (cur == last) break;
    }
  }
  return asVector;
}
00105 }
00106
00107
00108
00109
00110
00111 namespace IData {
00112
00113
00114 Dataset::Key2Val Dataset::s_key2val;
00115 unsigned Dataset::s_expId(0);
00116 std::string Dataset::s_instrName;
00117 std::string Dataset::s_expName;
00118
00119
00120
00121
00122
00123
00124
00125
00126
00127
00128
00129
00130 void
00131 Dataset::setAppExpName(const std::string& exp)
00132 {
00133 if (not ::parseExpName(exp, s_expId, s_instrName, s_expName)) {
00134 throw IData::ExpNameException(ERR_LOC, exp);
00135 }
00136 s_key2val["exp"] = exp;
00137 }
00138
00139
00140
00141
00142
00143
00144
00145
00146
00147
00148 void
00149 Dataset::setDefOption(const std::string& key, const std::string& value)
00150 {
00151 if (key == "exp") {
00152 if (not ::parseExpName(value, s_expId, s_instrName, s_expName)) {
00153 throw IData::ExpNameException(ERR_LOC, value);
00154 }
00155 } else if (key == "run") {
00156 MsgLog(logger, warning, "setDefOption() does not accept run numbers");
00157 }
00158 s_key2val[key] = value;
00159 }
00160
00161
00162
00163
00164 Dataset::Dataset()
00165 : m_isFile(false)
00166 , m_key2val()
00167 , m_runs()
00168 , m_expId(0)
00169 , m_instrName()
00170 , m_expName()
00171 , m_files()
00172 {
00173 }
00174
00175 Dataset::Dataset(const std::string& ds)
00176 : m_isFile(false)
00177 , m_key2val()
00178 , m_runs()
00179 , m_expId(0)
00180 , m_instrName()
00181 , m_expName()
00182 , m_files()
00183 {
00184 if (::isFileName(ds)) {
00185
00186
00187 if (boost::ends_with(ds, ".xtc")) {
00188 parseXtcFileName(ds);
00189 } else if (boost::ends_with(ds, ".h5")) {
00190 parseHdfFileName(ds);
00191 }
00192
00193
00194 m_files.push_back(ds);
00195
00196 m_isFile = true;
00197
00198 } else {
00199
00200
00201 std::vector<std::string> options;
00202 boost::split(options, ds, boost::is_any_of(":"), boost::token_compress_on);
00203 for (std::vector<std::string>::const_iterator it = options.begin(); it != options.end(); ++ it) {
00204
00205 std::string option = *it;
00206 boost::trim(option);
00207 if (option.empty()) continue;
00208
00209 std::string key(option);
00210 std::string val;
00211
00212 std::string::size_type p = option.find('=');
00213 if (p != std::string::npos) {
00214 key.erase(p);
00215 boost::trim(key);
00216 val = option.substr(p+1);
00217 boost::trim(val);
00218 }
00219
00220 if (key == "exp") {
00221 if (not ::parseExpName(val, m_expId, m_instrName, m_expName)) {
00222 throw IData::ExpNameException(ERR_LOC, val);
00223 }
00224 } else if (key == "run") {
00225 ::parseRuns(val, m_runs);
00226 } else if (key == "stream") {
00227 ::parseStreams(val, m_streams);
00228 }
00229 m_key2val[key] = val;
00230
00231 }
00232 }
00233
00234 }
00235
00236
00237
00238
00239 Dataset::~Dataset()
00240 {
00241 }
00242
00243
00244
00245
00246
00247
00248
00249
00250
00251 bool
00252 Dataset::exists(const std::string& key) const
00253 {
00254 return m_key2val.find(key) != m_key2val.end() or
00255 s_key2val.find(key) != s_key2val.end();
00256 }
00257
00258
00259
00260
00261
00262
00263 const std::string&
00264 Dataset::value(const std::string& key) const
00265 {
00266
00267 Key2Val::const_iterator it = m_key2val.find(key);
00268 if (it != m_key2val.end()) return it->second;
00269
00270
00271 it = s_key2val.find(key);
00272 if (it != s_key2val.end()) return it->second;
00273
00274
00275 static std::string def;
00276 return def;
00277 }
00278
00279
00280 unsigned
00281 Dataset::expID() const
00282 {
00283 if (m_expId == 0) return s_expId;
00284 return m_expId;
00285 }
00286
00287
00288 const std::string&
00289 Dataset::instrument() const
00290 {
00291 if (m_instrName.empty()) return s_instrName;
00292 return m_instrName;
00293 }
00294
00295
00296 const std::string&
00297 Dataset::experiment() const
00298 {
00299 if (m_expName.empty()) return s_expName;
00300 return m_expName;
00301 }
00302
00303
00304 const Dataset::Runs&
00305 Dataset::runs() const
00306 {
00307 return m_runs;
00308 }
00309
00310
00311 const Dataset::Streams&
00312 Dataset::streams() const
00313 {
00314 return m_streams;
00315 }
00316
00317
00318 std::vector<unsigned>
00319 Dataset::runsAsList() const
00320 {
00321 return listOfPairsAsVector(m_runs);
00322 }
00323
00324
00325 std::vector<unsigned>
00326 Dataset::streamsAsList() const
00327 {
00328 return listOfPairsAsVector(m_streams);
00329 }
00330
00331
00332 std::string
00333 Dataset::dirName() const
00334 {
00335
00336 std::string dir = this->value("dir");
00337 if (dir.empty()) {
00338 const char* type = this->exists("h5") ? "hdf5" : "xtc";
00339 boost::format fmt("%1%/%2%/%3%/%4%");
00340 const char* datadir = getenv("SIT_PSDM_DATA");
00341 if (datadir) {
00342 fmt % datadir % instrument() % experiment() % type;
00343 dir = fmt.str();
00344 } else {
00345 fmt % "/reg/d/psdm" % instrument() % experiment() % type;
00346 dir = fmt.str();
00347 }
00348 }
00349 return dir;
00350 }
00351
00352
00353 const Dataset::NameList&
00354 Dataset::files() const
00355 {
00356 if (not m_files.empty()) return m_files;
00357
00358 bool hdf5 = this->exists("h5");
00359 bool smd = this->exists("smd");
00360
00361
00362 std::string dir = this->dirName();
00363 if (smd) {
00364 dir += "/smalldata";
00365 }
00366 if (not fs::is_directory(dir)) {
00367 throw DatasetDirError(ERR_LOC, dir);
00368 }
00369
00370 std::vector<unsigned> runsList = runsAsList();
00371 std::vector<unsigned> streamsList = streamsAsList();
00372 std::set<unsigned> runs(runsList.begin(), runsList.end());
00373 std::set<unsigned> streams(streamsList.begin(), streamsList.end());
00374
00375 if (hdf5 and (streams.size()>0)) {
00376 MsgLog(logger, warning, "Stream specification ignored for matching hdf5 files");
00377 }
00378
00379
00380 std::map<unsigned, unsigned> filesPerRun;
00381 for (fs::directory_iterator fiter(dir); fiter != fs::directory_iterator(); ++ fiter) {
00382
00383 const fs::path& path = fiter->path();
00384 const fs::path& basename = path.filename();
00385
00386 boost::regex re = constructDataFileRegEx(hdf5, smd, experiment(), expID());
00387
00388
00389 if (boost::regex_match(basename.string(), re) and fiter->status().type() == fs::regular_file) {
00390 boost::smatch what;
00391 boost::regex_search(basename.string(), what, re);
00392 unsigned run = boost::lexical_cast<unsigned>(std::string(what[1]));
00393 if (runs.find(run) != runs.end()) {
00394 bool streamMatch = true;
00395 if ((streams.size()>0) and (not hdf5)) {
00396 unsigned stream = boost::lexical_cast<unsigned>(std::string(what[2]));
00397 if (streams.find(stream) == streams.end()) {
00398 streamMatch = false;
00399 }
00400 }
00401 if (streamMatch) {
00402 MsgLog(logger, trace, "found matching file: " << path);
00403 m_files.push_back(path.string());
00404 ++ filesPerRun[run];
00405 }
00406 }
00407 }
00408 }
00409
00410
00411 for (IData::Dataset::Runs::const_iterator ritr = m_runs.begin(); ritr != m_runs.end(); ++ ritr) {
00412
00413 if (ritr->first == ritr->second) {
00414 if (filesPerRun[ritr->first] == 0) {
00415 MsgLog(logger, warning, "no input files found for run #" << ritr->first);
00416 }
00417 }
00418 }
00419
00420 return m_files;
00421 }
00422
00423 void
00424 Dataset::parseXtcFileName(std::string path)
00425 {
00426 m_key2val["xtc"];
00427
00428
00429 std::string::size_type p = path.rfind('/');
00430 if (p != std::string::npos) path.erase(0, p+1);
00431
00432
00433 p = path.rfind('.');
00434 if (p != std::string::npos) path.erase(p);
00435
00436
00437 std::vector<std::string> parts;
00438 boost::split(parts, path, boost::is_any_of("-"), boost::token_compress_on);
00439
00440
00441 if (parts.size() < 2) return;
00442
00443
00444 if (parts[0].empty() or parts[0][0] != 'e') return;
00445 std::string expid(parts[0], 1);
00446
00447
00448 if (expid.empty() or not boost::all(expid, boost::is_digit())) return;
00449
00450
00451 if (parts[1].empty() or parts[1][0] != 'r') return;
00452 std::string run(parts[1], 1);
00453
00454
00455 if (run.empty() or not boost::all(run, boost::is_digit())) return;
00456
00457
00458 if (not ::parseExpName(expid, m_expId, m_instrName, m_expName)) {
00459 MsgLog(logger, warning, "unrecognized experiment ID: " << expid);
00460 }
00461 ::parseRuns(run, m_runs);
00462 }
00463
00464
00465 void
00466 Dataset::parseHdfFileName(std::string path)
00467 {
00468 m_key2val["h5"];
00469
00470
00471 std::string::size_type p = path.rfind('/');
00472 if (p != std::string::npos) path.erase(0, p+1);
00473
00474
00475 p = path.rfind('.');
00476 if (p != std::string::npos) path.erase(p);
00477
00478
00479 std::vector<std::string> parts;
00480 boost::split(parts, path, boost::is_any_of("-"), boost::token_compress_on);
00481
00482
00483 if (parts.size() < 2) return;
00484
00485
00486 if (parts[0].empty()) return;
00487 std::string expname(parts[0]);
00488
00489
00490 if (parts[1].empty() or parts[1][0] != 'r') return;
00491 std::string run(parts[1], 1);
00492
00493
00494 if (run.empty() or not boost::all(run, boost::is_digit())) return;
00495
00496
00497 if (not ::parseExpName(expname, m_expId, m_instrName, m_expName)) {
00498 MsgLog(logger, warning, "unrecognized experiment name: " << expname);
00499 }
00500 ::parseRuns(run, m_runs);
00501 }
00502
00503 }
00504
00505 namespace {
00506
00507
00508
00509 bool parseExpName(const std::string& exp, unsigned& expId, std::string& instrName, std::string& expName)
00510 {
00511 ExpNameDb::ExpNameDatabase namedb;
00512
00513 if (boost::all(exp, boost::is_digit())) {
00514
00515
00516 unsigned num = boost::lexical_cast<unsigned>(exp);
00517 std::pair<std::string, std::string> instrExp = namedb.getNames(num);
00518 if (instrExp.first.empty()) {
00519 return false;
00520 }
00521
00522 expId = num;
00523 instrName = instrExp.first;
00524 expName = instrExp.second;
00525
00526 } else {
00527
00528
00529 std::string::size_type p = exp.find('/');
00530 if (p == std::string::npos) {
00531
00532
00533 std::pair<std::string, unsigned> instrExp = namedb.getInstrumentAndID(exp);
00534 if (instrExp.first.empty()) {
00535 return false;
00536 }
00537
00538 expId = instrExp.second;
00539 instrName = instrExp.first;
00540 expName = exp;
00541
00542 } else {
00543
00544
00545 const std::string instrument(exp, 0, p);
00546 const std::string experiment(exp, p+1);
00547
00548 unsigned num = namedb.getID(instrument, experiment);
00549 if (num == 0) {
00550 return false;
00551 }
00552
00553 expId = num;
00554 instrName = instrument;
00555 expName = experiment;
00556
00557 }
00558
00559
00560 }
00561
00562 return true;
00563 }
00564
00565
00566 void parseRuns(const std::string& str, IData::Dataset::Runs& runs)
00567 {
00568 runs.clear();
00569
00570
00571 std::vector<std::string> ranges;
00572 boost::split(ranges, str, boost::is_any_of(","), boost::token_compress_on);
00573 for (std::vector<std::string>::const_iterator it = ranges.begin(); it != ranges.end(); ++ it) {
00574
00575 std::string range = *it;
00576 boost::trim(range);
00577 if (range.empty()) continue;
00578
00579 std::string startStr(range);
00580 std::string endStr;
00581
00582 std::string::size_type p = range.find('-');
00583 if (p != std::string::npos) {
00584 startStr.erase(p);
00585 boost::trim(startStr);
00586 endStr = range.substr(p+1);
00587 boost::trim(endStr);
00588 }
00589
00590 unsigned start, end;
00591 try {
00592 start = boost::lexical_cast<unsigned>(startStr);
00593 if (endStr.empty()) {
00594 end = start;
00595 } else {
00596 end = boost::lexical_cast<unsigned>(endStr);
00597 }
00598 } catch (const boost::bad_lexical_cast& ex) {
00599 throw IData::RunNumberSpecException(ERR_LOC, str, ex.what());
00600 }
00601
00602 runs.push_back(IData::Dataset::Runs::value_type(start, end));
00603
00604 }
00605
00606 }
00607
00608
00609 void parseStreams(const std::string& str, IData::Dataset::Streams& streams)
00610 {
00611 streams.clear();
00612
00613
00614 std::vector<std::string> ranges;
00615 boost::split(ranges, str, boost::is_any_of(","), boost::token_compress_on);
00616 for (std::vector<std::string>::const_iterator it = ranges.begin(); it != ranges.end(); ++ it) {
00617
00618 std::string range = *it;
00619 boost::trim(range);
00620 if (range.empty()) continue;
00621
00622 std::string startStr(range);
00623 std::string endStr;
00624
00625 std::string::size_type p = range.find('-');
00626 if (p != std::string::npos) {
00627 startStr.erase(p);
00628 boost::trim(startStr);
00629 endStr = range.substr(p+1);
00630 boost::trim(endStr);
00631 }
00632
00633 unsigned start, end;
00634 try {
00635 start = boost::lexical_cast<unsigned>(startStr);
00636 if (endStr.empty()) {
00637 end = start;
00638 } else {
00639 end = boost::lexical_cast<unsigned>(endStr);
00640 }
00641 } catch (const boost::bad_lexical_cast& ex) {
00642 throw IData::StreamRangeSpecException(ERR_LOC, str, ex.what());
00643 }
00644 if (end < start) {
00645 throw IData::StreamRangeSpecException(ERR_LOC, str, "the first number in the range must be less or equal to the last one");
00646 }
00647
00648 streams.push_back(IData::Dataset::Streams::value_type(start, end));
00649
00650 }
00651
00652 }
00653
00654
00655
// Decides whether a dataset string is a file name rather than a list of
// "key=value" options.
//
// Without any colon: the string is a file name if it contains no '=' and
// contains at least one '.' or '/'.
//
// With colons: the string is a file name only if every colon is directly
// followed by a '/' or a digit (as in "scheme://host:1094/path"); a colon at
// the very end, or one followed by anything else, means an option list.
bool
isFileName(const std::string& str)
{
  std::string::size_type col = str.find(':');
  if (col == std::string::npos) {
    // single token: "key=value" is an option, otherwise need a '.' or '/'
    if (str.find('=') != std::string::npos) return false;
    return str.find_first_of("./") != std::string::npos;
  }

  for (; col != std::string::npos; col = str.find(':', col+1)) {

    // nothing after the colon
    if (col+1 == str.size()) return false;

    // std::isdigit() is only defined for values representable as unsigned
    // char (or EOF); a plain char may be negative for non-ASCII bytes.
    const unsigned char next = static_cast<unsigned char>(str[col+1]);
    if (next != '/' and not std::isdigit(next)) return false;
  }

  return true;
}
00693
00694 }