{"@context":{"@language":"en","@vocab":"https://schema.org/","citeAs":"cr:citeAs","column":"cr:column","conformsTo":"dct:conformsTo","cr":"http://mlcommons.org/croissant/","rai":"http://mlcommons.org/croissant/RAI/","data":{"@id":"cr:data","@type":"@json"},"dataType":{"@id":"cr:dataType","@type":"@vocab"},"dct":"http://purl.org/dc/terms/","examples":{"@id":"cr:examples","@type":"@json"},"extract":"cr:extract","field":"cr:field","fileProperty":"cr:fileProperty","fileObject":"cr:fileObject","fileSet":"cr:fileSet","format":"cr:format","includes":"cr:includes","isLiveDataset":"cr:isLiveDataset","jsonPath":"cr:jsonPath","key":"cr:key","md5":"cr:md5","parentField":"cr:parentField","path":"cr:path","recordSet":"cr:recordSet","references":"cr:references","regex":"cr:regex","repeated":"cr:repeated","replace":"cr:replace","sc":"https://schema.org/","separator":"cr:separator","source":"cr:source","subField":"cr:subField","transform":"cr:transform","wd":"https://www.wikidata.org/wiki/"},"@type":"sc:Dataset","conformsTo":"http://mlcommons.org/croissant/1.0","name":"Image-Guided Object Detection using OWL-ViT and Enhanced Query Embedding Extraction","url":"https://doi.org/10.7910/DVN/PRHQMK","creator":[{"@type":"Person","givenName":"Melih","familyName":"Serin","affiliation":{"@type":"Organization","name":"Boğaziçi University"},"name":"Melih Serin"}],"description":"Computer vision has been receiving increasing attention with the recent complex Generative AI models released by tech industry giants, such as OpenAI and Google. However, there is a specific subfield that we wanted to focus on, that is, Image-Guided Object Detection. A detailed literature survey directed us towards a successful study called Simple Open-Vocabulary Object Detection with Vision Transformers (OWL-ViT) [1], which is a multifunctional complex model that can also perform image-guided object detection as a side function. \nIn this study, some experiments have been conducted utilizing OWL-ViT architecture as the base model and manipulated the necessary parts to achieve a better one-shot performance. Code and models are available on GitHub.","keywords":["Engineering","Open-Vocabulary Object Detection with Vision Transformers (OWL-ViT)","Object Detection","Vision Transformers","End-to-End Training","Generalized Intersection over Union (gIoU) Loss"],"license":"http://creativecommons.org/publicdomain/zero/1.0","datePublished":"2024-04-14","dateModified":"2024-04-14","includedInDataCatalog":{"@type":"DataCatalog","name":"Harvard Dataverse","url":"https://dataverse.harvard.edu"},"publisher":{"@type":"Organization","name":"Harvard Dataverse"},"version":"1.0","citeAs":"@data{DVN/PRHQMK_2024,author = {Melih Serin},publisher = {Harvard Dataverse},title = {Image-Guided Object Detection using OWL-ViT and Enhanced Query Embedding Extraction},year = {2024},url = {https://doi.org/10.7910/DVN/PRHQMK}}","distribution":[{"@type":"cr:FileObject","@id":"ImageGuidedObjectDetection.pdf","name":"ImageGuidedObjectDetection.pdf","encodingFormat":"application/pdf","md5":"3d43488c06f591c6c2a832dc82181d05","contentSize":"4173342","description":"","contentUrl":"https://dataverse.harvard.edu/api/access/datafile/10117853"}]}