外觀的數量對於k聚https://en.wikipedia.org/wiki/K-mer查找最常見的k聚體和它們在C++
我試圖找到一個大的fastq文件最常見的k聚體。我的計劃是使用MISRA-格里斯算法來找出最頻繁的k聚體,然後用第二遍搜索每一個頻繁k鏈節的數量在文件中。但我認爲我的算法不夠高效。以下是我的第一份草案。我儘量內存儘可能高效。(程序不能運行內存不足)
我也發現了這個DSK算法,但是這個人是太難受,沒有看到一個簡單的實現理解。 http://minia.genouest.org/dsk/
注:另外每個櫃檯的ID將是整數不是字符串,我要在我的最後草案進行更改。
#include <iostream>
#include <fstream>
#include <sstream>
#include <vector>
using namespace std;
struct node {
string id;
int count;
};
void searchCount(vector <node>&memory, string line,int k) {
int count = 0;
string kMer;
for (int i = 0; i < memory.size(); i++) {
if (memory[i].id != "") {
for (int j = 0; j < line.length() - k + 1; j++) {
kMer = line.substr(j, k);
if (kMer == memory[i].id) {
count++;
}
}
}
memory[i].count = count;
count = 0;
}
}
int doWeHaveSpace(vector <node> memory) {
for (int i = 0; i < memory.size(); i++) {
if (memory[i].id == "") {
return i;
}
}
return -1;
}
void MisraGries(string element, vector <node> &memory) {
bool isHere = false;
int index;
for (int i = 0; i < memory.size(); i++) {
if (memory[i].id == element) {
isHere = true;
index = i;
}
}
if (isHere) {
memory[index].count++;
}
else {
int freeIndex = doWeHaveSpace(memory);
if (freeIndex+1) {
memory[freeIndex].count++;
memory[freeIndex].id = element;
}
else {
for (int i = 0; i < memory.size(); i++) {
if (memory[i].count != 0) {
memory[i].count--;
if (memory[i].count == 0) {
memory[i].id = "";
}
}
}
}
}
}
void filecheck(ifstream & input, string prompt) // this function checks and opens input files
{
string filename;
cout << "Please enter file directory and name for " << prompt << ": ";
do
{
getline(cin, filename);
input.open(filename.c_str());
if (input.fail())
cout << " wrong file directory. Please enter real directory. ";
} while (input.fail());
}
int main() {
int line = 1;
string filename;
ifstream input;
ofstream output;
string search;
vector <node> frequent(1000);
for (int i = 0; i < frequent.size(); i++) {
frequent[i].id = "";
frequent[i].count = 0;
}
int k = 30;
string kMer;
filecheck(input, "input file");
while (!input.eof())
{
getline(input, search); // it gets infos line by line to count lines
line++;
if (line == 3) {
for (int i = 0; i < search.length() - k + 1; i++) {
kMer = search.substr(i, k);
MisraGries(kMer, frequent);
}
line = -1;
}
}
return 0;
}
什麼是你的時間需求?真?我的第一遍是創建一個文件,其行是每個kmer。使用Unix排序實用程序進行外部排序,然後掃描它以查找我的數量。無效,當然。但是編寫簡單的代碼,簡單易懂,並且能夠在幾分鐘內處理100 GB文件。 – btilly