Academic Journal

How to optimally sample a sequence for rapid analysis

التفاصيل البيبلوغرافية
العنوان: How to optimally sample a sequence for rapid analysis
المؤلفون: Frith, Martin C, Shaw, Jim, Spouge, John L
المساهمون: Kelso, Janet, Japan Science and Technology Agency, National Library of Medicine, National Institutes of Health
المصدر: Bioinformatics ; volume 39, issue 2 ; ISSN 1367-4811
بيانات النشر: Oxford University Press (OUP)
سنة النشر: 2023
الوصف: Motivation: We face an increasing flood of genetic sequence data, from diverse sources, requiring rapid computational analysis. Rapid analysis can be achieved by sampling a subset of positions in each sequence. Previous sequence-sampling methods, such as minimizers, syncmers and minimally overlapping words, were developed by heuristic intuition, and are not optimal. Results: We present a sequence-sampling approach that provably optimizes sensitivity for a whole class of sequence comparison methods, for randomly evolving sequences. It is likely near-optimal for a wide range of alignment-based and alignment-free analyses. For real biological DNA, it increases specificity by avoiding simple repeats. Our approach generalizes universal hitting sets (which guarantee to sample a sequence at least once) and polar sets (which guarantee to sample a sequence at most once). This helps us understand how to do rapid sequence analysis as accurately as possible. Availability and implementation: Source code is freely available at https://gitlab.com/mcfrith/noverlap. Supplementary information: Supplementary data are available at Bioinformatics online.
نوع الوثيقة: article in journal/newspaper
اللغة: English
DOI: 10.1093/bioinformatics/btad057
DOI: 10.1093/bioinformatics/btad057/48907444/btad057.pdf
الاتاحة: http://dx.doi.org/10.1093/bioinformatics/btad057
https://academic.oup.com/bioinformatics/advance-article-pdf/doi/10.1093/bioinformatics/btad057/48907444/btad057.pdf
https://academic.oup.com/bioinformatics/article-pdf/39/2/btad057/49124149/btad057.pdf
Rights: https://creativecommons.org/licenses/by/4.0/
رقم الانضمام: edsbas.CF51DC14
قاعدة البيانات: BASE
ResultId 1
Header edsbas
BASE
edsbas.CF51DC14
961
3
Academic Journal
academicJournal
960.976379394531
PLink https://search.ebscohost.com/login.aspx?direct=true&site=eds-live&scope=site&db=edsbas&AN=edsbas.CF51DC14&custid=s6537998&authtype=sso
FullText Array ( [Availability] => 0 )
Array ( [0] => Array ( [Url] => http://dx.doi.org/10.1093/bioinformatics/btad057# [Name] => EDS - BASE [Category] => fullText [Text] => View record in BASE [MouseOverText] => View record in BASE ) )
Items Array ( [Name] => Title [Label] => Title [Group] => Ti [Data] => How to optimally sample a sequence for rapid analysis )
Array ( [Name] => Author [Label] => Authors [Group] => Au [Data] => <searchLink fieldCode="AR" term="%22Frith%2C+Martin+C%22">Frith, Martin C</searchLink><br /><searchLink fieldCode="AR" term="%22Shaw%2C+Jim%22">Shaw, Jim</searchLink><br /><searchLink fieldCode="AR" term="%22Spouge%2C+John+L%22">Spouge, John L</searchLink> )
Array ( [Name] => Author [Label] => Contributors [Group] => Au [Data] => Kelso, Janet<br />Japan Science and Technology Agency<br />National Library of Medicine<br />National Institutes of Health )
Array ( [Name] => TitleSource [Label] => Source [Group] => Src [Data] => Bioinformatics ; volume 39, issue 2 ; ISSN 1367-4811 )
Array ( [Name] => Publisher [Label] => Publisher Information [Group] => PubInfo [Data] => Oxford University Press (OUP) )
Array ( [Name] => DatePubCY [Label] => Publication Year [Group] => Date [Data] => 2023 )
Array ( [Name] => Abstract [Label] => Description [Group] => Ab [Data] => Motivation We face an increasing flood of genetic sequence data, from diverse sources, requiring rapid computational analysis. Rapid analysis can be achieved by sampling a subset of positions in each sequence. Previous sequence-sampling methods, such as minimizers, syncmers and minimally overlapping words, were developed by heuristic intuition, and are not optimal. Results We present a sequence-sampling approach that provably optimizes sensitivity for a whole class of sequence comparison methods, for randomly evolving sequences. It is likely near-optimal for a wide range of alignment-based and alignment-free analyses. For real biological DNA, it increases specificity by avoiding simple repeats. Our approach generalizes universal hitting sets (which guarantee to sample a sequence at least once) and polar sets (which guarantee to sample a sequence at most once). This helps us understand how to do rapid sequence analysis as accurately as possible. Availability and implementation Source code is freely available at https://gitlab.com/mcfrith/noverlap. Supplementary information Supplementary data are available at Bioinformatics online. )
Array ( [Name] => TypeDocument [Label] => Document Type [Group] => TypDoc [Data] => article in journal/newspaper )
Array ( [Name] => Language [Label] => Language [Group] => Lang [Data] => English )
Array ( [Name] => DOI [Label] => DOI [Group] => ID [Data] => 10.1093/bioinformatics/btad057 )
Array ( [Name] => DOI [Label] => DOI [Group] => ID [Data] => 10.1093/bioinformatics/btad057/48907444/btad057.pdf )
Array ( [Name] => URL [Label] => Availability [Group] => URL [Data] => http://dx.doi.org/10.1093/bioinformatics/btad057<br />https://academic.oup.com/bioinformatics/advance-article-pdf/doi/10.1093/bioinformatics/btad057/48907444/btad057.pdf<br />https://academic.oup.com/bioinformatics/article-pdf/39/2/btad057/49124149/btad057.pdf )
Array ( [Name] => Copyright [Label] => Rights [Group] => Cpyrght [Data] => https://creativecommons.org/licenses/by/4.0/ )
Array ( [Name] => AN [Label] => Accession Number [Group] => ID [Data] => edsbas.CF51DC14 )
RecordInfo Array ( [BibEntity] => Array ( [Identifiers] => Array ( [0] => Array ( [Type] => doi [Value] => 10.1093/bioinformatics/btad057 ) ) [Languages] => Array ( [0] => Array ( [Text] => English ) ) [Titles] => Array ( [0] => Array ( [TitleFull] => How to optimally sample a sequence for rapid analysis [Type] => main ) ) ) [BibRelationships] => Array ( [HasContributorRelationships] => Array ( [0] => Array ( [PersonEntity] => Array ( [Name] => Array ( [NameFull] => Frith, Martin C ) ) ) [1] => Array ( [PersonEntity] => Array ( [Name] => Array ( [NameFull] => Shaw, Jim ) ) ) [2] => Array ( [PersonEntity] => Array ( [Name] => Array ( [NameFull] => Spouge, John L ) ) ) [3] => Array ( [PersonEntity] => Array ( [Name] => Array ( [NameFull] => Kelso, Janet ) ) ) [4] => Array ( [PersonEntity] => Array ( [Name] => Array ( [NameFull] => Japan Science and Technology Agency ) ) ) [5] => Array ( [PersonEntity] => Array ( [Name] => Array ( [NameFull] => National Library of Medicine ) ) ) [6] => Array ( [PersonEntity] => Array ( [Name] => Array ( [NameFull] => National Institutes of Health ) ) ) ) [IsPartOfRelationships] => Array ( [0] => Array ( [BibEntity] => Array ( [Dates] => Array ( [0] => Array ( [D] => 01 [M] => 01 [Type] => published [Y] => 2023 ) ) [Identifiers] => Array ( [0] => Array ( [Type] => issn-locals [Value] => edsbas ) [1] => Array ( [Type] => issn-locals [Value] => edsbas.oa ) ) [Titles] => Array ( [0] => Array ( [TitleFull] => Bioinformatics ; volume 39, issue 2 ; ISSN 1367-4811 [Type] => main ) ) ) ) ) ) )
IllustrationInfo