<ddi:DDIInstance xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="ddi:instance:3_3 http://ddialliance.org/Specification/DDI-Lifecycle/3.3/XMLSchema/instance.xsd" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:ddi="ddi:instance:3_3" xmlns:r="ddi:reusable:3_3" xmlns:s="ddi:studyunit:3_3" xmlns:d="ddi:datacollection:3_3" xmlns:a="ddi:archive:3_3" xmlns:c="ddi:conceptualcomponent:3_3" xmlns:cm="ddi:comparative:3_3" xmlns:g="ddi:group:3_3" xmlns:l="ddi:logicalproduct:3_3" xmlns:p="ddi:physicaldataproduct:3_3" xmlns:pi="ddi:physicalinstance:3_3" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:xhtml="http://www.w3.org/1999/xhtml" xmlns:xml="http://www.w3.org/XML/1998/namespace" isMaintainable="true" scopeOfUniqueness="Agency">
  <r:URN>urn:ddi:se.researchdata:doi-10-17044-scilifelab-28606814:0</r:URN>
  <r:Agency>SND</r:Agency>
  <r:ID>doi-10-17044-scilifelab-28606814</r:ID>
  <r:Version>0</r:Version>
  <g:ResourcePackage>
    <r:URN>urn:ddi:se.researchdata:doi-10-17044-scilifelab-28606814.ResourcePackage:2.0</r:URN>
    <r:OtherMaterialScheme>
      <r:URN>urn:ddi:se.researchdata:doi-10-17044-scilifelab-28606814.OtherMaterialScheme:2.0</r:URN>
    </r:OtherMaterialScheme>
    <a:OrganizationScheme>
      <r:URN>urn:ddi:se.researchdata:doi-10-17044-scilifelab-28606814.OrganizationScheme-0:2.0</r:URN>
      <a:Individual>
        <r:URN>urn:ddi:se.researchdata:doi-10-17044-scilifelab-28606814.Individual-0:2.0</r:URN>
        <a:IndividualIdentification>
          <a:IndividualName>
            <a:FirstGiven>Jonas</a:FirstGiven>
            <a:LastFamily>Ravn</a:LastFamily>
            <a:FullName>
              <r:String>Jonas Ravn</r:String>
            </a:FullName>
          </a:IndividualName>
          <a:ResearcherID>
            <a:TypeOfID>ORCID</a:TypeOfID>
            <a:ResearcherIdentification>0000-0003-4328-2530</a:ResearcherIdentification>
          </a:ResearcherID>
        </a:IndividualIdentification>
      </a:Individual>
      <a:Individual>
        <r:URN>urn:ddi:se.researchdata:doi-10-17044-scilifelab-28606814.Individual-0:2.0</r:URN>
        <a:IndividualIdentification>
          <a:IndividualName>
            <a:FirstGiven>Amanda</a:FirstGiven>
            <a:LastFamily>Sörensen Ristinmaa</a:LastFamily>
            <a:FullName>
              <r:String>Amanda Sörensen Ristinmaa</r:String>
            </a:FullName>
          </a:IndividualName>
          <a:ResearcherID>
            <a:TypeOfID>ORCID</a:TypeOfID>
            <a:ResearcherIdentification>0000-0003-0120-0330</a:ResearcherIdentification>
          </a:ResearcherID>
        </a:IndividualIdentification>
      </a:Individual>
      <a:Individual>
        <r:URN>urn:ddi:se.researchdata:doi-10-17044-scilifelab-28606814.Individual-0:2.0</r:URN>
        <a:IndividualIdentification>
          <a:IndividualName>
            <a:FirstGiven>Scott</a:FirstGiven>
            <a:LastFamily>Mazurkewich</a:LastFamily>
            <a:FullName>
              <r:String>Scott Mazurkewich</r:String>
            </a:FullName>
          </a:IndividualName>
          <a:ResearcherID>
            <a:TypeOfID>ORCID</a:TypeOfID>
            <a:ResearcherIdentification>0000-0002-9238-0615</a:ResearcherIdentification>
          </a:ResearcherID>
        </a:IndividualIdentification>
      </a:Individual>
      <a:Individual>
        <r:URN>urn:ddi:se.researchdata:doi-10-17044-scilifelab-28606814.Individual-0:2.0</r:URN>
        <r:UserAttributePair>
          <r:AttributeKey>affiliation</r:AttributeKey>
          <r:AttributeValue>Science for Life Laboratory</r:AttributeValue>
        </r:UserAttributePair>
        <a:IndividualIdentification>
          <a:IndividualName>
            <a:FirstGiven>Guilherme</a:FirstGiven>
            <a:LastFamily>Borges Dias</a:LastFamily>
            <a:FullName>
              <r:String>Guilherme Borges Dias</r:String>
            </a:FullName>
          </a:IndividualName>
          <a:ResearcherID>
            <a:TypeOfID>ORCID</a:TypeOfID>
            <a:ResearcherIdentification>0000-0002-1459-3148</a:ResearcherIdentification>
          </a:ResearcherID>
        </a:IndividualIdentification>
      </a:Individual>
      <a:Individual>
        <r:URN>urn:ddi:se.researchdata:doi-10-17044-scilifelab-28606814.Individual-0:2.0</r:URN>
        <a:IndividualIdentification>
          <a:IndividualName>
            <a:FirstGiven>Johan</a:FirstGiven>
            <a:LastFamily>Larsbrink</a:LastFamily>
            <a:FullName>
              <r:String>Johan Larsbrink</r:String>
            </a:FullName>
          </a:IndividualName>
          <a:ResearcherID>
            <a:TypeOfID>ORCID</a:TypeOfID>
            <a:ResearcherIdentification>0000-0001-8386-2914</a:ResearcherIdentification>
          </a:ResearcherID>
        </a:IndividualIdentification>
      </a:Individual>
      <a:Individual>
        <r:URN>urn:ddi:se.researchdata:doi-10-17044-scilifelab-28606814.Individual-0:2.0</r:URN>
        <a:IndividualIdentification>
          <a:IndividualName>
            <a:FirstGiven>Cecilia</a:FirstGiven>
            <a:LastFamily>Geijer</a:LastFamily>
            <a:FullName>
              <r:String>Cecilia Geijer</r:String>
            </a:FullName>
          </a:IndividualName>
          <a:ResearcherID>
            <a:TypeOfID>ORCID</a:TypeOfID>
            <a:ResearcherIdentification>0000-0002-4158-2938</a:ResearcherIdentification>
          </a:ResearcherID>
        </a:IndividualIdentification>
      </a:Individual>
    </a:OrganizationScheme>
  </g:ResourcePackage>
  <s:StudyUnit>
    <r:URN>urn:ddi:se.researchdata:doi-10-17044-scilifelab-28606814.StudyUnit:2.0</r:URN>
    <r:UserID typeOfUserID="datasetIdentifier">doi-10-17044-scilifelab-28606814</r:UserID>
    <r:Citation>
      <r:Title>
        <r:String xml:lang="en">Gene annotation of Blastobotrys mokoenaii, Blastobotrys illinoisensis, and Blastobotrys malaysiensis</r:String>
      </r:Title>
      <r:Creator>
        <r:CreatorReference>
          <r:URN>urn:ddi:se.researchdata:doi-10-17044-scilifelab-28606814.Individual-0:2.0</r:URN>
          <r:TypeOfObject>Individual</r:TypeOfObject>
        </r:CreatorReference>
      </r:Creator>
      <r:Publisher>
        <r:PublisherName>
          <r:String xml:lang="sv">Chalmers tekniska högskola</r:String>
          <r:String xml:lang="en">Chalmers University of Technology</r:String>
        </r:PublisherName>
      </r:Publisher>
      <r:Publisher>
        <r:PublisherName>
          <r:String xml:lang="sv">Chalmers tekniska högskola</r:String>
          <r:String xml:lang="en">Chalmers University of Technology</r:String>
        </r:PublisherName>
      </r:Publisher>
      <r:PublicationDate>
        <r:SimpleDate>2025-03-21</r:SimpleDate>
      </r:PublicationDate>
      <r:InternationalIdentifier>
        <r:IdentifierContent>10.17044/SCILIFELAB.28606814</r:IdentifierContent>
        <r:ManagingAgency controlledVocabularyAgencyName="DOI">DOI</r:ManagingAgency>
      </r:InternationalIdentifier>
    </r:Citation>
    <r:Abstract>
      <r:Content xml:lang="en">This dataset contains the gene annotation data for three species of Blastobotrys yeasts: B. mokoenaii, B. illinoisensis, and B. malaysiensis.

The genome assemblies for B. mokoenaii (NRRL Y-27120) and B. malaysiensis (NRRL Y-6417) were publicly available on the National Center for Biotechnology Information (NCBI) under accessions GCA_003705765.3 and GCA_030558815.1, respectively.

The genome assembly for B. illinoisensis (NRRL YB-1343) was generated by SciLifeLab's National Genomics Infrastructure (NGI) using PacBio long-read data and deposited in the European Nucleotide Archive (ENA) under accession GCA_965113335.1.

File description
- bmokoenaii_annotation.gff
This file contains the gene models predicted for B. mokoenaii (GCA_003705765.3).
- billinoisensis_annotation.gff
This file contains the gene models predicted for B. illinoisensis (GCA_965113335.1).
- bmalaysiensis_annotation.gff
This file contains the gene models predicted for B. malaysiensis (GCA_030558815.1).
Gene annotation methods

Repeat Masking
Prior to annotation, a repeat library was built for each species using RepeatModeler2 v2.0.2 and the genomes were soft-masked using RepeatMasker v4.1.5.


$ RepeatModeler -database ${DB} -engine ncbi -pa 16
$ RepeatMasker -dir . -gff -u -no_is -xsmall -e ncbi -lib ${LIBRARY} -pa 16 genome.fasta

Structural Annotation
Structural annotation was performed on the soft-masked genomes using Braker3 v3.0.3 incorporating external evidence in the form of all fungal proteins from OrthoDB v11 (available at https://bioinf.uni-greifswald.de/bioinf/partitioned_odb11).


$ braker.pl --genome="$genome" \
--prot_seq=${protein} --workingdir=${PWD} \
--gff3 --threads=16 --verbosity=3 \
--nocleanup --species=${i}

Functional Annotation

The predicted genes were functionally annotated using the National Bioinformatics Infrastructure Sweden (NBIS) functional_annotation nextflow pipeline v2.0.0 (https://github.com/NBISweden/pipelines-nextflow). Briefly, this pipeline performs similarity searches between the annotated proteins and the UniProtKB/Swiss-Prot database (downloaded on 2023-12) using the Basic Local Alignment Search Tool (BLAST). Then it uses InterProScan to query the proteins against InterPro v59-91 databases, and merges results using AGAT v1.2.0.

tRNAs and rRNAs

Transfer RNA (tRNA) and ribosomal RNA (rRNA) genes were annotated using tRNAscan-SE v2.0.12 and barrnap v0.9, respectively. Other ncRNAs, such as SRP RNA, RNase P RNA, spliceosomal ncRNAs etc., have not been predicted. Finally, the functionally annotated protein-coding genes, tRNAs, and rRNAs were combined into a single GFF file using AGAT v1.2.0.

$ tRNAscan-SE -E --gff ${output}_trnas.gff --thread 16 ${genome}.fasta
$ barrnap --kingdom euk --threads 6 ${genome}.fasta &gt; ${output}_rrna.gff

Annotation integration

Finally, the functionally annotated protein-coding genes, tRNAs, and rRNAs were combined into a single GFF file using AGAT v1.2.0.

$ agat_sp_complement_annotations.pl --ref ${protein_coding} --add ${trna} --add ${rrna} --out full_annotation.gff</r:Content>
    </r:Abstract>
    <r:Coverage>
      <r:TopicalCoverage>
        <r:URN>urn:ddi:se.researchdata:doi-10-17044-scilifelab-28606814.TopicalCoverage:2.0</r:URN>
        <r:Subject xml:lang="en" controlledVocabularyID="3" controlledVocabularyName="Standard för svensk indelning av forskningsämnen 2025">Medical and Health Sciences</r:Subject>
        <r:Subject xml:lang="sv" controlledVocabularyID="3" controlledVocabularyName="Standard för svensk indelning av forskningsämnen 2025">Medicin och hälsovetenskap</r:Subject>
        <r:Subject xml:lang="en" controlledVocabularyID="10609" controlledVocabularyName="Standard för svensk indelning av forskningsämnen 2025">Genetics and Genomics</r:Subject>
        <r:Subject xml:lang="sv" controlledVocabularyID="10609" controlledVocabularyName="Standard för svensk indelning av forskningsämnen 2025">Genetik och genomik</r:Subject>
        <r:Subject xml:lang="en" controlledVocabularyID="106" controlledVocabularyName="Standard för svensk indelning av forskningsämnen 2025">Biological Sciences</r:Subject>
        <r:Subject xml:lang="sv" controlledVocabularyID="106" controlledVocabularyName="Standard för svensk indelning av forskningsämnen 2025">Biologi</r:Subject>
      </r:TopicalCoverage>
      <r:SpatialCoverage />
    </r:Coverage>
    <a:Archive>
      <r:URN>urn:ddi:se.researchdata:doi-10-17044-scilifelab-28606814.Archive:2.0</r:URN>
      <a:ArchiveSpecific>
        <a:Item>
          <a:Access>
            <r:URN>urn:ddi:se.researchdata:doi-10-17044-scilifelab-28606814.Archive-ArchiveSpecificType-AccessType:2.0</r:URN>
            <a:TypeOfAccess controlledVocabularyName="info:eu-repo-Access-Terms vocabulary"></a:TypeOfAccess>
          </a:Access>
          <a:DataFileQuantity>0</a:DataFileQuantity>
        </a:Item>
      </a:ArchiveSpecific>
    </a:Archive>
  </s:StudyUnit>
</ddi:DDIInstance>