<?xml version="1.0" encoding="UTF-8"?>
<resource xmlns="http://datacite.org/schema/kernel-4" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4.5/metadata.xsd">
  <identifier identifierType="DOI">10.7910/DVN/PRHQMK</identifier>
  <creators>
    <creator>
      <creatorName nameType="Personal">Melih Serin</creatorName>
      <givenName>Melih</givenName>
      <familyName>Serin</familyName>
      <affiliation>Boğaziçi University</affiliation>
    </creator>
  </creators>
  <titles>
    <title>Image-Guided Object Detection using OWL-ViT and Enhanced Query Embedding Extraction</title>
  </titles>
  <publisher>Harvard Dataverse</publisher>
  <publicationYear>2024</publicationYear>
  <subjects>
    <subject>Engineering</subject>
    <subject>Open-Vocabulary Object Detection with Vision Transformers (OWL-ViT)</subject>
    <subject>Object Detection</subject>
    <subject>Vision Transformers</subject>
    <subject>End-to-End Training</subject>
    <subject>Generalized Intersection over Union (gIoU) Loss</subject>
  </subjects>
  <contributors>
    <contributor contributorType="ContactPerson">
      <contributorName nameType="Personal">Melih Serin</contributorName>
      <givenName>Melih</givenName>
      <familyName>Serin</familyName>
      <affiliation>Boğaziçi University</affiliation>
    </contributor>
  </contributors>
  <dates>
    <date dateType="Submitted">2024-04-14</date>
    <date dateType="Available">2024-04-14</date>
  </dates>
  <resourceType resourceTypeGeneral="Dataset"/>
  <sizes>
    <size>4173342</size>
  </sizes>
  <formats>
    <format>application/pdf</format>
  </formats>
  <version>1.0</version>
  <rightsList>
    <rights rightsURI="info:eu-repo/semantics/openAccess"/>
    <rights rightsURI="http://creativecommons.org/publicdomain/zero/1.0" rightsIdentifier="CC0-1.0" rightsIdentifierScheme="SPDX" schemeURI="https://spdx.org/licenses/" xml:lang="en">Creative Commons CC0 1.0 Universal Public Domain Dedication.</rights>
  </rightsList>
  <descriptions>
    <description descriptionType="Abstract">Computer vision has been receiving increasing attention with the recent complex Generative AI models released by tech industry giants, such as OpenAI and Google. However, there is a specific subfield that we wanted to focus on, that is, Image-Guided Object Detection. A detailed literature survey directed us towards a successful study called Simple Open-Vocabulary Object Detection with Vision Transformers (OWL-ViT) [1], which is a multifunctional complex model that can also perform image-guided object detection as a side function. In this study, we conducted experiments using the OWL-ViT architecture as the base model and modified the necessary parts to achieve better one-shot performance. Code and models are available on GitHub.</description>
  </descriptions>
</resource>
