
Commit

fuck cohere
vaughanlove committed Nov 28, 2023
1 parent 4b19e98 commit 35482b2
Showing 6 changed files with 247 additions and 232 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -1,3 +1,4 @@
venv
.env.dev
**/__pycache__/**
**/__pycache__/**
.env.local
179 changes: 67 additions & 112 deletions app/main.py
@@ -1,7 +1,9 @@
from typing import Union, Annotated, List
import os
import numpy as np
from llmsherpa.readers import LayoutPDFReader
import json
import re
import cohere
from fastapi import FastAPI, File, UploadFile, Request, Depends
from fastapi.middleware.cors import CORSMiddleware
from fastapi.security import OAuth2PasswordBearer
@@ -10,31 +12,25 @@
from dotenv import find_dotenv
from dotenv import load_dotenv
from pydantic import BaseModel
from app.scrape.ScrapeClient import ScrapingClient
from app.cohere.CohereClient import CohereClient
import io
from openai import OpenAI

env_file = find_dotenv(".env.dev")
load_dotenv(env_file)

COHERE_API_KEY = os.getenv("CO_API_KEY")
SUPABASE_URL: str = os.environ.get("SUPABASE_URL")
SUPABASE_API_KEY: str = os.environ.get("SUPABASE_KEY")
SCRAPE_CLIENT_API_KEY = os.environ.get("SCRAPE_API_KEY")

co = CohereClient(COHERE_API_KEY)
supabase: Client = create_client(SUPABASE_URL, SUPABASE_API_KEY)
scrape_client = ScrapingClient(scrapeops_api_key=SCRAPE_CLIENT_API_KEY, num_concurrent_threads=5)

# llm sherpa for reading pdfs
llmsherpa_api_url = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all"
pdf_reader = LayoutPDFReader(llmsherpa_api_url)

class URLPayload(BaseModel):
url: str

class ModelPayload(BaseModel):
model: str


env_file = find_dotenv(".env.dev")
load_dotenv(env_file)

url: str = os.environ.get("SUPABASE_URL")
key: str = os.environ.get("SUPABASE_KEY")
supabase: Client = create_client(url, key)

client = OpenAI()

app = FastAPI()

oauth2_scheme = OAuth2PasswordBearer(tokenUrl="token")
@@ -54,7 +50,7 @@ class ModelPayload(BaseModel):

@app.get("/")
def read_root():
return {"Hello": "World"}
return {"Hello": "GeCover"}

@app.post("/extract_url/")
async def extract_url(payload: URLPayload, token: Annotated[str, Depends(oauth2_scheme)]): #, token: Annotated[str, Depends(oauth2_scheme)]
@@ -70,106 +66,65 @@ async def extract_url(payload: URLPayload, token: Annotated[str, Depends(oauth2_
else:
return {"error" : "Unable to extract job data from URL!"}

@app.post("/read_pdf/")
async def read_pdf(file: Annotated[bytes, File()], token: Annotated[str, Depends(oauth2_scheme)]):

# get user data from JWT
@app.post("/generate_paragraphs/")
def generate_paragraphs(file: Annotated[bytes, File()], requirements: str, token: Annotated[str, Depends(oauth2_scheme)]):#, token: Annotated[str, Depends(oauth2_scheme)]
# # get user data from JWT
data = supabase.auth.get_user(token)

# assert that the user is authenticated.
# # assert that the user is authenticated.
assert data.user.aud == 'authenticated', "402: not authenticated."

# the path_or_url is fake, ignored when contents is set.
try:
content = pdf_reader.read_pdf(path_or_url="https://someexapmple.com/myfile.pdf", contents=file)
# content = client.detect_document_text(Document={'Bytes': file})
file = client.files.create(
file=file,
purpose="assistants"
)
except:
# very mid error handling
return {"contents" : []}
return {"id" : None}

docs = []
for section in content.sections():
docs.append(section.to_text(include_children=True, recurse=True))

return {"contents": docs }
# input_credentials = ("\n - ").join(requirements)

@app.post("/generate_paragraphs/")
def generate_paragraphs(requirements: List[str], resume_documents: List[str], model: ModelPayload, token: Annotated[str, Depends(oauth2_scheme)]):#, token: Annotated[str, Depends(oauth2_scheme)]
# # get user data from JWT
data = supabase.auth.get_user(token)
# # assert that the user is authenticated.
assert data.user.aud == 'authenticated', "402: not authenticated."

documents = []

for doc in resume_documents:
documents.append({"snippet" : doc})

queries = []

for i, req in enumerate(requirements):
query = f"""
You are acting as a personal professional writer.
Explain in two sentences about how I satisfy the following job requirement written in the first person:
note: Do not act as a chat bot. Do not preface the response with "sure, here is that summary:".
note: Do not finish the paragraph with anything like "anything else I can help with?" or "is there anything else you would like to know?".
note: If you don't have the information, do not output things like [Company name] or [first name] placeholders.
Reference the documents provided that contain information about me. Be positive and enthusiastic!
Job requirement:
{req}
Summary of why I satisfy the job requirement in 2-3 sentences:
"""
queries.append(query)
#print('QUERIES: ', queries)

with ThreadPoolExecutor(max_workers=len(requirements)) as executor:
futures = [executor.submit(co.chat, query, documents=documents) for query in queries]
responses = [future.result().text for future in futures]

#Encode your documents with input type 'search_document'
doc_emb = co.embed(responses)
doc_emb = np.asarray(doc_emb)

query = """ The most important job requirement to satisfy."""

rerank_hits = co.rerank(query=query, documents=responses, top_n=min(len(documents), 5))

rerank_results = [x.document['text'] for x in rerank_hits]

# reverse it, because LLMs have a recency bias
rerank_results.reverse()

# rerank_hits.reverse()
input_credentials = ("\n - ").join(rerank_results)

if model == 'altman':
para_one_prompt = f"""
You are acting as a personal professional writer.
note: DO NOT prompt the user as a chat bot. Don't repeat skills once you have said them.
note: Make it a maximum of two paragraphs.
note: Please only output the paragraph. Do not preface the paragraphs with "sure, here are those paragraphs:". Do not finish the paragraph with anything like "anything else I can help with?".
note: if you don't have the information, do not output tokens like "[Company name]" or "[first name]" as placeholders.
Write in first person, and be positive and enthusiastic!
The points to summarize:
- {input_credentials}
First person summary:
"""
result = co.generate(model='e1f1b8c8-f87a-4fd3-9346-99068e5b7036-ft', prompt=para_one_prompt, k=25, temperature=0.96, frequency_penalty=0.2, num_generations=1)
return {'para_A' : result.data[0], 'para_B' : 'result.data[1]'}
prompt = f"""
- {requirements}
else:
response = co.summarize(
text=input_credentials,
format="paragraph",
temperature=0.96,
length='long',
model='command-nightly',
extractiveness='auto',
additional_command='Generate a summary of the first person credentials being provided. The summary should maintain the first-person prose. Remove all open-ended questions and placeholder tokens like [company name] or [first name] as examples.'
Could you write me a couple paragraphs without an introduction/outro about why I am the right candidate for the job? The document you have access to is my CV.
"""

print(prompt)

thread = client.beta.threads.create(
messages=[
{
"role": "user",
"content": prompt,
"file_ids": [file.id]
}
]
)

run = client.beta.threads.runs.create(
thread_id=thread.id,
assistant_id="asst_C0GRyfBLNOXtrxlOPpA4ouvr",
)

return {'para_A' : response.summary, 'para_B' : 'result.data[1]'}

while True:
run = client.beta.threads.runs.retrieve(
thread_id=thread.id,
run_id=run.id
)
if run.status == 'completed':
break

time.sleep(3)

messages = client.beta.threads.messages.list(
thread_id=thread.id
)

delete_file = client.files.delete(file.id)
delete_thread = client.beta.threads.delete(thread.id)

return {'para_A' : "something broke", 'para_B' : 'result.data[1]'}
# print(messages.data[0].content[0].text.value)
# print(delete_thread)
return {'para_A' : messages.data[0].content[0].text.value, 'para_B' : 'result.data[1]'}
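
Taken together, the new main.py flow is: upload the resume, open an Assistants thread with the prompt and the file attached, start a run against a pre-built assistant, poll until the run finishes, read the newest message, then delete the file and thread. Below is a minimal, self-contained sketch of that flow. It assumes the openai 1.x SDK current at the time of this commit and an OPENAI_API_KEY in the environment, and it reuses the assistant ID hard-coded in the hunk; the function name, the upload filename, and the failure-status handling are illustrative additions, not part of the commit.

import time

from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

ASSISTANT_ID = "asst_C0GRyfBLNOXtrxlOPpA4ouvr"  # assistant ID taken from the hunk above


def generate_paragraphs_sketch(resume_bytes: bytes, requirements: str) -> str:
    # Upload the resume so the assistant can read it. A (filename, bytes) tuple is
    # used here so the API can infer the file type; the commit passes raw bytes.
    uploaded = client.files.create(file=("resume.pdf", resume_bytes), purpose="assistants")

    prompt = (
        f"- {requirements}\n"
        "Could you write me a couple paragraphs without an introduction/outro about "
        "why I am the right candidate for the job? The document you have access to is my CV."
    )

    # Open a thread with the prompt and the uploaded file attached.
    thread = client.beta.threads.create(
        messages=[{"role": "user", "content": prompt, "file_ids": [uploaded.id]}]
    )

    # Start a run against the pre-built assistant, then poll every 3 seconds
    # (the same interval the commit uses).
    run = client.beta.threads.runs.create(thread_id=thread.id, assistant_id=ASSISTANT_ID)
    while run.status not in ("completed", "failed", "cancelled", "expired"):
        time.sleep(3)
        run = client.beta.threads.runs.retrieve(thread_id=thread.id, run_id=run.id)

    try:
        if run.status != "completed":
            raise RuntimeError(f"assistant run ended with status {run.status}")
        # The newest message in the thread is the assistant's reply.
        messages = client.beta.threads.messages.list(thread_id=thread.id)
        return messages.data[0].content[0].text.value
    finally:
        # Clean up the uploaded file and the thread either way.
        client.files.delete(uploaded.id)
        client.beta.threads.delete(thread.id)

Polling with a fixed sleep mirrors the commit; a hardened version would also cap the number of retries rather than loop indefinitely.
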
41 changes: 13 additions & 28 deletions app/scrape/ScrapeClient.py
@@ -12,6 +12,7 @@
import logging
from dotenv import load_dotenv, find_dotenv
from app.cohere.CohereClient import CohereClient
from openai import OpenAI

# logging.basicConfig()
# logging.getLogger().setLevel(logging.DEBUG)
@@ -24,8 +25,7 @@
if dotenv_path:
load_dotenv(dotenv_path)

COHERE_API_KEY = os.getenv("CO_API_KEY")
co = CohereClient(COHERE_API_KEY)
client = OpenAI()

class ScrapingClient:

@@ -110,20 +110,12 @@ def send_request(self, url, method='GET', scrapeops_proxy_settings=None, **kwarg
soup = bs.BeautifulSoup(source,'lxml')
div = soup.find("div", class_ = "show-more-less-html__markup show-more-less-html__markup--clamp-after-5 relative overflow-hidden" )
if div:
# summarize with cohere
# temperature zero for the time being.
# keeping it at zero allows us to better experiment and tweak things, knowing the LLM is a control.
response = co.summarize(
text=div.get_text(),
length='short',
format='bullets',
model='command',
additional_command='extract the most important qualifications.',
extractiveness='high',
temperature=0.0,
)
prompt = f"Please extract the most important job requirements from the following job posting and list them in point form: {div.get_text()}."
completion = client.chat.completions.create(model="gpt-3.5-turbo", messages=[{"role": "user", "content": prompt}])
response = completion.choices[0].message.content

# first element is always ""
clean_response = response.summary.split('- ')[1:]
clean_response = response.split('- ')[1:]

company = soup.find("a", class_="topcard__org-name-link topcard__flavor--black-link").get_text()
pattern = r"(?<=\n)(.*?)(?=\n)"
@@ -175,20 +167,13 @@ def is_qualification_or_requirement_header(tag):

for qualification in qualifications:
print('qualification: ', qualification)
# summarize with cohere
# temperature zero for the time being.
# keeping it at zero allows us to better experiment and tweak things, knowing the LLM is a control.
qualificationsResponse = co.summarize(
text=', '.join(qualifications),
length='short',
format='bullets',
model='command',
additional_command='extract the most important qualifications.',
extractiveness='high',
temperature=0.0,
)

prompt = f"Please extract the most important job requirements from the following job posting and list them in point form: {div.get_text()}."
completion = client.chat.completions.create(model="gpt-3.5-turbo", messages=[{"role": "user", "content": prompt}])
response = completion.choices[0].message.content

# first element is always ""
clean_response = qualificationsResponse.summary.split('- ')[1:]
clean_response = response.split('- ')[1:]
return {"contents" : clean_response, 'company': job_data[job_id]['company'], 'job_title': job_data[job_id]['jobTitle']}
except Exception as e:
print('Request error:', e)
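
Both ScrapeClient.py hunks make the same substitution: co.summarize is replaced by a single chat completion, and the existing split('- ') parsing is kept to turn the bulleted reply into a list. A minimal sketch of that extraction step, assuming the openai 1.x SDK and an OPENAI_API_KEY in the environment (the function name is illustrative):

from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment


def extract_requirements(posting_text: str) -> list[str]:
    prompt = (
        "Please extract the most important job requirements from the following "
        f"job posting and list them in point form: {posting_text}."
    )
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
    )
    response = completion.choices[0].message.content

    # The reply is expected as "- item" bullets; as in the diff, the text before
    # the first bullet is dropped by taking [1:] after splitting on "- ".
    return [item.strip() for item in response.split("- ")[1:]]
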
