/***************************************************************************** subtractBed.cpp (c) 2009 - Aaron Quinlan Hall Laboratory Department of Biochemistry and Molecular Genetics University of Virginia aaronquinlan@gmail.com Licenced under the GNU General Public License 2.0 license. ******************************************************************************/ #include "lineFileUtilities.h" #include "subtractBed.h" /* Constructor */ BedSubtract::BedSubtract(string &bedAFile, string &bedBFile, float &overlapFraction, bool &forceStrand) { _bedAFile = bedAFile; _bedBFile = bedBFile; _overlapFraction = overlapFraction; _forceStrand = forceStrand; _bedA = new BedFile(bedAFile); _bedB = new BedFile(bedBFile); SubtractBed(); } /* Destructor */ BedSubtract::~BedSubtract(void) { } void BedSubtract::FindAndSubtractOverlaps(BED &a, vector &hits) { // find all of the overlaps between a and B. _bedB->FindOverlapsPerBin(a.chrom, a.start, a.end, a.strand, hits, _forceStrand); // is A completely spanned by an entry in B? // if so, A should not be reported. int numConsumedByB = 0; int numOverlaps = 0; vector bOverlaps; // list of hits in B. Special processing if there are multiple. vector::const_iterator h = hits.begin(); vector::const_iterator hitsEnd = hits.end(); for (; h != hitsEnd; ++h) { int s = max(a.start, h->start); int e = min(a.end, h->end); int overlapBases = (e - s); // the number of overlapping bases b/w a and b int aLength = (a.end - a.start); // the length of a in b.p. if (s < e) { // is there enough overlap (default ~ 1bp) float overlap = ((float) overlapBases / (float) aLength); if (overlap >= 1.0) { numOverlaps++; numConsumedByB++; } else if ( overlap >= _overlapFraction ) { numOverlaps++; bOverlaps.push_back(*h); } } } if (numOverlaps == 0) { // no overlap found, so just report A as-is. _bedA->reportBedNewLine(a); } else if (numOverlaps == 1) { // one overlap found. only need to look at the single // entry in bOverlaps. // if A was not "consumed" by any entry in B if (numConsumedByB == 0) { BED theHit = bOverlaps[0]; // A ++++++++++++ // B ---- // Res. ==== ==== if ( (theHit.start > a.start) && (theHit.end < a.end) ) { _bedA->reportBedRangeNewLine(a,a.start,theHit.start); _bedA->reportBedRangeNewLine(a,theHit.end,a.end); } // A ++++++++++++ // B ---------- // Res. == else if (theHit.start == a.start) { _bedA->reportBedRangeNewLine(a,theHit.end,a.end); } // A ++++++++++++ // B ---------- // Res. ==== else if (theHit.start < a.start) { _bedA->reportBedRangeNewLine(a,theHit.end,a.end); } // A ++++++++++++ // B ---------- // Res. ======= else if (theHit.start > a.start) { _bedA->reportBedRangeNewLine(a,a.start,theHit.start); } } } else if (numOverlaps > 1) { // multiple overlapz found. look at all the hits // and figure out which bases in A survived. then // report the contigous intervals that survived. vector aKeep(a.end - a.start, true); if (numConsumedByB == 0) { // track the number of hit starts and ends at each position in A for (vector::iterator h = bOverlaps.begin(); h != bOverlaps.end(); ++h) { int s = max(a.start, h->start); int e = min(a.end, h->end); for (int i = s+1; i <= e; ++i) { aKeep[i-a.start-1] = false; } } // report the remaining blocks. for (unsigned int i = 0; i < aKeep.size(); ++i) { if (aKeep[i] == true) { CHRPOS blockStart = i + a.start; while ((aKeep[i] == true) && (i < aKeep.size())) { i++; } CHRPOS blockEnd = i + a.start; blockEnd = min(a.end, blockEnd); _bedA->reportBedRangeNewLine(a,blockStart,blockEnd); } } } } } void BedSubtract::SubtractBed() { // load the "B" bed file into a map so // that we can easily compare "A" to it for overlaps _bedB->loadBedFileIntoMap(); BED a, nullBed; BedLineStatus bedStatus; int lineNum = 0; // current input line number vector hits; // vector of potential hits // reserve some space hits.reserve(100); _bedA->Open(); while ((bedStatus = _bedA->GetNextBed(a, lineNum)) != BED_INVALID) { if (bedStatus == BED_VALID) { FindAndSubtractOverlaps(a, hits); hits.clear(); a = nullBed; } } _bedA->Close(); } // END Intersect