{"dcterms:modified":"2025-04-01","dcterms:creator":"Harvard Dataverse","@type":"ore:ResourceMap","schema:additionalType":"Dataverse OREMap Format v1.0.1","dvcore:generatedBy":{"@type":"schema:SoftwareApplication","schema:name":"Dataverse","schema:version":"6.6 build 1829-192cdc4","schema:url":"https://github.com/iqss/dataverse"},"@id":"https://dataverse.harvard.edu/api/datasets/export?exporter=OAI_ORE&persistentId=https://doi.org/10.7910/DVN/BZ1RWS","ore:describes":{"citation:datasetContact":{"citation:datasetContactName":"Rozado, David","citation:datasetContactAffiliation":"Otago Polytechnic","citation:datasetContactEmail":"drozado@gmail.com"},"citation:keyword":{"citation:keywordValue":"word2vec Model, word embeddings"},"citation:dsDescription":{"citation:dsDescriptionValue":"word2vec model trained on the concatenation of all the individual universities corpora. To generate the word embeddings of the corpus, the gensim implementation of word2vec (CBOW) was used. For training the word embeddings model, the following parameters were used: vector dimensions=300, window size=10, negative sampling=10, down sampling frequent words = 0.00008 (downsamples 612 most-common words), number of iterations (epochs) through the corpus=10, maximum final vocabulary= 3 million. The maximum final vocabulary resulted in an effective minimum frequency count of 20. That is, only terms that appear more than 20 times in the corpus were included into the word embedding model vocabulary. The exponent used to shape the negative sampling distribution was 0.5.","citation:dsDescriptionDate":"2019"},"author":{"citation:authorName":"Rozado, David","citation:authorAffiliation":"CSIRO","authorIdentifierScheme":"ORCID","authorIdentifier":"https://orcid.org/0000-0001-6849-4746"},"subject":["Computer and Information Science","Social Sciences"],"title":"word2vec model trained on universities corpus","dateOfDeposit":"2019-06-10","citation:depositor":"Rozado, David","@id":"https://doi.org/10.7910/DVN/BZ1RWS","@type":["ore:Aggregation","schema:Dataset"],"schema:version":"1.0","schema:name":"word2vec model trained on universities corpus","schema:dateModified":"Mon Jun 10 21:25:53 UTC 2019","schema:datePublished":"2019-06-10","schema:creativeWorkStatus":"RELEASED","schema:license":"http://creativecommons.org/publicdomain/zero/1.0","dvcore:fileTermsOfAccess":{"dvcore:fileRequestAccess":false},"schema:includedInDataCatalog":"Harvard Dataverse","schema:isPartOf":{"schema:name":"Universities corpus","@id":"https://dataverse.harvard.edu/dataverse/universities-corpus","schema:description":"Between April and October 2018, the Internet domains of 50 elite research universities in the US were scraped using the Scrapy framework for automated crawling and extraction of data from websites. The list of universities scraped was taken from the top 50 entries in the US News University Ranking Charts of 2017. \r\n\r\nThe scraping process started at the base URL of each University domain and proceeded to extract textual elements contained in a URL endpoint and to follow all the detected links pointing within the University domain to continue collecting textual elements up to a predefined depth level. A depth first crawling algorithm for visiting scraped links was followed for memory efficiency reasons. In total, a text corpus of size 42 GB for the 50 universities combined was retrieved.\r\n","schema:isPartOf":{"schema:name":"Harvard Dataverse","@id":"https://dataverse.harvard.edu/dataverse/harvard","schema:description":"<span><span><span><h3>Share, archive, and get credit for your data. Find and cite data across all research fields.</h3></span></span></span>"}},"ore:aggregates":[{"schema:name":"word2vecModelsTrigrams.part1.rar","dvcore:restricted":false,"schema:version":1,"dvcore:datasetVersionId":158730,"@id":"doi:10.7910/DVN/BZ1RWS/FDSELM","schema:sameAs":"https://dataverse.harvard.edu/api/access/datafile/:persistentId?persistentId=doi:10.7910/DVN/BZ1RWS/FDSELM","@type":"ore:AggregatedResource","schema:fileFormat":"application/x-rar-compressed","dvcore:filesize":2147483648,"dvcore:storageIdentifier":"s3://dvn-cloud:16b40ac440b-283160fb183b","dvcore:rootDataFileId":-1,"dvcore:checksum":{"@type":"MD5","@value":"a8f9ab6a24b5df35ed232ab667ffd21c"}},{"schema:name":"word2vecModelsTrigrams.part2.rar","dvcore:restricted":false,"schema:version":1,"dvcore:datasetVersionId":158730,"@id":"doi:10.7910/DVN/BZ1RWS/ZOAHOI","schema:sameAs":"https://dataverse.harvard.edu/api/access/datafile/:persistentId?persistentId=doi:10.7910/DVN/BZ1RWS/ZOAHOI","@type":"ore:AggregatedResource","schema:fileFormat":"application/x-rar-compressed","dvcore:filesize":1145831825,"dvcore:storageIdentifier":"s3://dvn-cloud:16b40b9caaa-c53caf4c3a01","dvcore:rootDataFileId":-1,"dvcore:checksum":{"@type":"MD5","@value":"8fca862a8c4d42ac22096d32ce25fe58"}}],"schema:hasPart":["doi:10.7910/DVN/BZ1RWS/FDSELM","doi:10.7910/DVN/BZ1RWS/ZOAHOI"]},"@context":{"author":"http://purl.org/dc/terms/creator","authorIdentifier":"http://purl.org/spar/datacite/AgentIdentifier","authorIdentifierScheme":"http://purl.org/spar/datacite/AgentIdentifierScheme","citation":"https://dataverse.org/schema/citation/","dateOfDeposit":"http://purl.org/dc/terms/dateSubmitted","dcterms":"http://purl.org/dc/terms/","dvcore":"https://dataverse.org/schema/core#","ore":"http://www.openarchives.org/ore/terms/","schema":"http://schema.org/","subject":"http://purl.org/dc/terms/subject","title":"http://purl.org/dc/terms/title"}}