Skip to content

Commit

Permalink
Update to use new models and new client approach (#1579)
Browse files Browse the repository at this point in the history
  • Loading branch information
lspacagna-oai authored Nov 27, 2024
1 parent 4e9bc47 commit fa92636
Show file tree
Hide file tree
Showing 4 changed files with 102,381 additions and 249 deletions.
84 changes: 34 additions & 50 deletions examples/Embedding_Wikipedia_articles_for_search.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -30,20 +30,20 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# imports\n",
"import mwclient # for downloading example Wikipedia articles\n",
"import mwparserfromhell # for splitting Wikipedia articles into sections\n",
"import openai # for generating embeddings\n",
"from openai import OpenAI # for generating embeddings\n",
"import os # for environment variables\n",
"import pandas as pd # for DataFrames to store article sections and embeddings\n",
"import re # for cutting <ref> links out of Wikipedia articles\n",
"import tiktoken # for counting tokens\n",
"\n",
"client = openai.OpenAI(api_key=os.environ.get(\"OPENAI_API_KEY\", \"<your OpenAI API key if not set as env var>\"))"
"client = OpenAI(api_key=os.environ.get(\"OPENAI_API_KEY\", \"<your OpenAI API key if not set as env var>\"))"
]
},
{
Expand Down Expand Up @@ -84,14 +84,14 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 731 article titles in Category:2022 Winter Olympics.\n"
"Found 179 article titles in Category:2022 Winter Olympics.\n"
]
}
],
Expand Down Expand Up @@ -145,7 +145,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -230,14 +230,14 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 5730 sections in 731 pages.\n"
"Found 1838 sections in 179 pages.\n"
]
}
],
Expand All @@ -252,14 +252,14 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Filtered out 530 sections, leaving 5200 sections.\n"
"Filtered out 89 sections, leaving 1749 sections.\n"
]
}
],
Expand Down Expand Up @@ -296,20 +296,20 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Lviv bid for the 2022 Winter Olympics']\n"
"['Concerns and controversies at the 2022 Winter Olympics']\n"
]
},
{
"data": {
"text/plain": [
"'{{Olympic bid|2022|Winter|\\n| Paralympics = yes\\n| logo = Lviv 2022 Winter Olym...'"
"'{{Short description|Overview of concerns and controversies surrounding the Ga...'"
]
},
"metadata": {},
Expand All @@ -320,13 +320,13 @@
"output_type": "stream",
"text": [
"\n",
"['Lviv bid for the 2022 Winter Olympics', '==History==']\n"
"['Concerns and controversies at the 2022 Winter Olympics', '==Criticism of host selection==']\n"
]
},
{
"data": {
"text/plain": [
"'[[Image:Lwów - Rynek 01.JPG|thumb|right|200px|View of Rynok Square in Lviv]]\\n...'"
"'American sportscaster [[Bob Costas]] criticized the [[International Olympic C...'"
]
},
"metadata": {},
Expand All @@ -337,13 +337,13 @@
"output_type": "stream",
"text": [
"\n",
"['Lviv bid for the 2022 Winter Olympics', '==Venues==']\n"
"['Concerns and controversies at the 2022 Winter Olympics', '==Organizing concerns and controversies==', '===Cost and climate===']\n"
]
},
{
"data": {
"text/plain": [
"'{{Location map+\\n|Ukraine\\n|border =\\n|caption = Venue areas\\n|float = left\\n|widt...'"
"'Several cities withdrew their applications during [[Bids for the 2022 Winter ...'"
]
},
"metadata": {},
Expand All @@ -354,13 +354,13 @@
"output_type": "stream",
"text": [
"\n",
"['Lviv bid for the 2022 Winter Olympics', '==Venues==', '===City zone===']\n"
"['Concerns and controversies at the 2022 Winter Olympics', '==Organizing concerns and controversies==', '===Promotional song===']\n"
]
},
{
"data": {
"text/plain": [
"'The main Olympic Park would be centered around the [[Arena Lviv]], hosting th...'"
"'Some commentators alleged that one of the early promotional songs for the [[2...'"
]
},
"metadata": {},
Expand All @@ -371,13 +371,13 @@
"output_type": "stream",
"text": [
"\n",
"['Lviv bid for the 2022 Winter Olympics', '==Venues==', '===Mountain zone===', '====Venue cluster Tysovets-Panasivka====']\n"
"['Concerns and controversies at the 2022 Winter Olympics', '== Diplomatic boycotts or non-attendance ==']\n"
]
},
{
"data": {
"text/plain": [
"'An existing military ski training facility in [[Tysovets, Skole Raion|Tysovet...'"
"'<section begin=boycotts />\\n[[File:2022 Winter Olympics (Beijing) diplomatic b...'"
]
},
"metadata": {},
Expand Down Expand Up @@ -419,11 +419,11 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"GPT_MODEL = \"gpt-3.5-turbo\" # only matters insofar as it selects which tokenizer to use\n",
"GPT_MODEL = \"gpt-4o-mini\" # only matters insofar as it selects which tokenizer to use\n",
"\n",
"\n",
"def num_tokens(text: str, model: str = GPT_MODEL) -> int:\n",
Expand Down Expand Up @@ -517,14 +517,14 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"5200 Wikipedia sections split into 6059 strings.\n"
"1749 Wikipedia sections split into 2052 strings.\n"
]
}
],
Expand All @@ -540,32 +540,20 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Lviv bid for the 2022 Winter Olympics\n",
"\n",
"==History==\n",
"\n",
"[[Image:Lwów - Rynek 01.JPG|thumb|right|200px|View of Rynok Square in Lviv]]\n",
"\n",
"On 27 May 2010, [[President of Ukraine]] [[Viktor Yanukovych]] stated during a visit to [[Lviv]] that Ukraine \"will start working on the official nomination of our country as the holder of the Winter Olympic Games in [[Carpathian Mountains|Carpathians]]\".\n",
"Concerns and controversies at the 2022 Winter Olympics\n",
"\n",
"In September 2012, [[government of Ukraine]] approved a document about the technical-economic substantiation of the national project \"Olympic Hope 2022\". This was announced by Vladyslav Kaskiv, the head of Ukraine´s Derzhinvestproekt (State investment project). The organizers announced on their website venue plans featuring Lviv as the host city and location for the \"ice sport\" venues, [[Volovets]] (around {{convert|185|km|mi|abbr=on}} from Lviv) as venue for the [[Alpine skiing]] competitions and [[Tysovets, Skole Raion|Tysovets]] (around {{convert|130|km|mi|abbr=on}} from Lviv) as venue for all other \"snow sport\" competitions. By March 2013 no other preparations than the feasibility study had been approved.\n",
"==Criticism of host selection==\n",
"\n",
"On 24 October 2013, session of the Lviv City Council adopted a resolution \"About submission to the International Olympic Committee for nomination of city to participate in the procedure for determining the host city of Olympic and Paralympic Winter Games in 2022\".\n",
"American sportscaster [[Bob Costas]] criticized the [[International Olympic Committee]]'s (IOC) decision to award the games to China saying \"The IOC deserves all of the disdain and disgust that comes their way for going back to China yet again\" referencing China's human rights record.\n",
"\n",
"On 5 November 2013, it was confirmed that Lviv was bidding to host the [[2022 Winter Olympics]]. Lviv would host the ice sport events, while the skiing events would be held in the [[Carpathian]] mountains. This was the first bid Ukraine had ever submitted for an Olympic Games.\n",
"\n",
"On 30 June 2014, the International Olympic Committee announced \"Lviv will turn its attention to an Olympic bid for 2026, and not continue with its application for 2022. The decision comes as a result of the present political and economic circumstances in Ukraine.\"\n",
"\n",
"Ukraine's Deputy Prime Minister Oleksandr Vilkul said that the Winter Games \"will be an impetus not just for promotion of sports and tourism in Ukraine, but a very important component in the economic development of Ukraine, the attraction of the investments, the creation of new jobs, opening Ukraine to the world, returning Ukrainians working abroad to their motherland.\"\n",
"\n",
"Lviv was one of the host cities of [[UEFA Euro 2012]].\n"
"After winning two gold medals and returning to his home country of Sweden skater [[Nils van der Poel]] criticized the IOC's selection of China as the host saying \"I think it is extremely irresponsible to give it to a country that violates human rights as blatantly as the Chinese regime is doing.\" He had declined to criticize China before leaving for the games saying \"I don't think it would be particularly wise for me to criticize the system I'm about to transition to, if I want to live a long and productive life.\"\n"
]
}
],
Expand All @@ -588,7 +576,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 13,
"metadata": {},
"outputs": [
{
Expand All @@ -597,11 +585,7 @@
"text": [
"Batch 0 to 999\n",
"Batch 1000 to 1999\n",
"Batch 2000 to 2999\n",
"Batch 3000 to 3999\n",
"Batch 4000 to 4999\n",
"Batch 5000 to 5999\n",
"Batch 6000 to 6999\n"
"Batch 2000 to 2999\n"
]
}
],
Expand Down Expand Up @@ -637,7 +621,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -665,7 +649,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.9"
"version": "3.12.6"
},
"orig_nbformat": 4
},
Expand Down
Loading

0 comments on commit fa92636

Please sign in to comment.