
Commit

fuck cohere
vaughanlove committed Nov 28, 2023
1 parent 4b19e98 commit 35482b2
Showing 6 changed files with 247 additions and 232 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -1,3 +1,4 @@
venv
.env.dev
**/__pycache__/**
**/__pycache__/**
.env.local
179 changes: 67 additions & 112 deletions app/main.py
@@ -1,7 +1,9 @@
from typing import Union, Annotated, List
import os
import numpy as np
from llmsherpa.readers import LayoutPDFReader
import json
import re
import cohere
from fastapi import FastAPI, File, UploadFile, Request, Depends
from fastapi.middleware.cors import CORSMiddleware
from fastapi.security import OAuth2PasswordBearer
@@ -10,31 +12,25 @@
from dotenv import find_dotenv
from dotenv import load_dotenv
from pydantic import BaseModel
from app.scrape.ScrapeClient import ScrapingClient
from app.cohere.CohereClient import CohereClient
import io
from openai import OpenAI

env_file = find_dotenv(".env.dev")
load_dotenv(env_file)

COHERE_API_KEY = os.getenv("CO_API_KEY")
SUPABASE_URL: str = os.environ.get("SUPABASE_URL")
SUPABASE_API_KEY: str = os.environ.get("SUPABASE_KEY")
SCRAPE_CLIENT_API_KEY = os.environ.get("SCRAPE_API_KEY")

co = CohereClient(COHERE_API_KEY)
supabase: Client = create_client(SUPABASE_URL, SUPABASE_API_KEY)
scrape_client = ScrapingClient(scrapeops_api_key=SCRAPE_CLIENT_API_KEY, num_concurrent_threads=5)

# llm sherpa for reading pdfs
llmsherpa_api_url = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all"
pdf_reader = LayoutPDFReader(llmsherpa_api_url)

class URLPayload(BaseModel):
url: str

class ModelPayload(BaseModel):
model: str


env_file = find_dotenv(".env.dev")
load_dotenv(env_file)

url: str = os.environ.get("SUPABASE_URL")
key: str = os.environ.get("SUPABASE_KEY")
supabase: Client = create_client(url, key)

client = OpenAI()

app = FastAPI()

oauth2_scheme = OAuth2PasswordBearer(tokenUrl="token")
@@ -54,7 +50,7 @@ class ModelPayload(BaseModel):

@app.get("/")
def read_root():
return {"Hello": "World"}
return {"Hello": "GeCover"}

@app.post("/extract_url/")
async def extract_url(payload: URLPayload, token: Annotated[str, Depends(oauth2_scheme)]): #, token: Annotated[str, Depends(oauth2_scheme)]
@@ -70,106 +66,65 @@ async def extract_url(payload: URLPayload, token: Annotated[str, Depends(oauth2_
else:
return {"error" : "Unable to extract job data from URL!"}

@app.post("/read_pdf/")
async def read_pdf(file: Annotated[bytes, File()], token: Annotated[str, Depends(oauth2_scheme)]):

# get user data from JWT
@app.post("/generate_paragraphs/")
def generate_paragraphs(file: Annotated[bytes, File()], requirements: str, token: Annotated[str, Depends(oauth2_scheme)]):#, token: Annotated[str, Depends(oauth2_scheme)]
# # get user data from JWT
data = supabase.auth.get_user(token)

# assert that the user is authenticated.
# # assert that the user is authenticated.
assert data.user.aud == 'authenticated', "402: not authenticated."

# the path_or_url is fake, ignored when contents is set.
try:
content = pdf_reader.read_pdf(path_or_url="https://someexapmple.com/myfile.pdf", contents=file)
# content = client.detect_document_text(Document={'Bytes': file})
file = client.files.create(
file=file,
purpose="assistants"
)
except:
# very mid error handling
return {"contents" : []}
return {"id" : None}

docs = []
for section in content.sections():
docs.append(section.to_text(include_children=True, recurse=True))

return {"contents": docs }
# input_credentials = ("\n - ").join(requirements)

@app.post("/generate_paragraphs/")
def generate_paragraphs(requirements: List[str], resume_documents: List[str], model: ModelPayload, token: Annotated[str, Depends(oauth2_scheme)]):#, token: Annotated[str, Depends(oauth2_scheme)]
# # get user data from JWT
data = supabase.auth.get_user(token)
# # assert that the user is authenticated.
assert data.user.aud == 'authenticated', "402: not authenticated."

documents = []

for doc in resume_documents:
documents.append({"snippet" : doc})

queries = []

for i, req in enumerate(requirements):
query = f"""
You are acting as a personal professional writer.
Explain in two sentences about how I satisfy the following job requirement written in the first person:
note: Do not act as a chat bot. Do not preface the response with "sure, here is that summary:".
note: Do not finish the paragraph with anything like "anything else I can help with?" or "is there anything else you would like to know?".
note: If you don't have the information, do not output things like [Company name] or [first name] placeholders.
Reference the documents provided that contain information about me. Be positive and enthusiastic!
Job requirement:
{req}
Summary of why I satisfy the job requirement in 2-3 sentences:
"""
queries.append(query)
#print('QUERIES: ', queries)

with ThreadPoolExecutor(max_workers=len(requirements)) as executor:
futures = [executor.submit(co.chat, query, documents=documents) for query in queries]
responses = [future.result().text for future in futures]

#Encode your documents with input type 'search_document'
doc_emb = co.embed(responses)
doc_emb = np.asarray(doc_emb)

query = """ The most important job requirement to satisfy."""

rerank_hits = co.rerank(query=query, documents=responses, top_n=min(len(documents), 5))

rerank_results = [x.document['text'] for x in rerank_hits]

# reverse it, because LLMs have a recency bias
rerank_results.reverse()

# rerank_hits.reverse()
input_credentials = ("\n - ").join(rerank_results)

if model == 'altman':
para_one_prompt = f"""
You are acting as a personal professional writer.
note: DO NOT prompt the user as a chat bot. Don't repeat skills once you have said them.
note: Make it a maximum of two paragraphs.
note: Please only output the paragraph. Do not preface the paragraphs with "sure, here are those paragraphs:". Do not finish the paragraph with anything like "anything else I can help with?".
note: if you don't have the information, do not output tokens like "[Company name]" or "[first name]" as placeholders.
Write in first person, and be positive and enthusiastic!
The points to summarize:
- {input_credentials}
First person summary:
"""
result = co.generate(model='e1f1b8c8-f87a-4fd3-9346-99068e5b7036-ft', prompt=para_one_prompt, k=25, temperature=0.96, frequency_penalty=0.2, num_generations=1)
return {'para_A' : result.data[0], 'para_B' : 'result.data[1]'}
prompt = f"""
- {requirements}
else:
response = co.summarize(
text=input_credentials,
format="paragraph",
temperature=0.96,
length='long',
model='command-nightly',
extractiveness='auto',
additional_command='Generate a summary of the first person credentials being provided. The summary should maintain the first-person prose. Remove all open-ended questions and placeholder tokens like [company name] or [first name] as examples.'
Could you write me a couple paragraphs without an introduction/outro about why I am the right candidate for the job? The document you have access to is my CV.
"""

print(prompt)

thread = client.beta.threads.create(
messages=[
{
"role": "user",
"content": prompt,
"file_ids": [file.id]
}
]
)

run = client.beta.threads.runs.create(
thread_id=thread.id,
assistant_id="asst_C0GRyfBLNOXtrxlOPpA4ouvr",
)

return {'para_A' : response.summary, 'para_B' : 'result.data[1]'}

while True:
run = client.beta.threads.runs.retrieve(
thread_id=thread.id,
run_id=run.id
)
if run.status == 'completed':
break

time.sleep(3)

messages = client.beta.threads.messages.list(
thread_id=thread.id
)

delete_file = client.files.delete(file.id)
delete_thread = client.beta.threads.delete(thread.id)

return {'para_A' : "something broke", 'para_B' : 'result.data[1]'}
# print(messages.data[0].content[0].text.value)
# print(delete_thread)
return {'para_A' : messages.data[0].content[0].text.value, 'para_B' : 'result.data[1]'}
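
Taken together, the new main.py flow is: upload the resume, open an Assistants thread with the prompt and the file attached, start a run against a pre-built assistant, poll until the run finishes, read the newest message, then delete the file and thread. Below is a minimal, self-contained sketch of that flow. It assumes the openai 1.x SDK current at the time of this commit and an OPENAI_API_KEY in the environment, and it reuses the assistant ID hard-coded in the hunk; the function name, the upload filename, and the failure-status handling are illustrative additions, not part of the commit.

import time

from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

ASSISTANT_ID = "asst_C0GRyfBLNOXtrxlOPpA4ouvr"  # assistant ID taken from the hunk above


def generate_paragraphs_sketch(resume_bytes: bytes, requirements: str) -> str:
    # Upload the resume so the assistant can read it. A (filename, bytes) tuple is
    # used here so the API can infer the file type; the commit passes raw bytes.
    uploaded = client.files.create(file=("resume.pdf", resume_bytes), purpose="assistants")

    prompt = (
        f"- {requirements}\n"
        "Could you write me a couple paragraphs without an introduction/outro about "
        "why I am the right candidate for the job? The document you have access to is my CV."
    )

    # Open a thread with the prompt and the uploaded file attached.
    thread = client.beta.threads.create(
        messages=[{"role": "user", "content": prompt, "file_ids": [uploaded.id]}]
    )

    # Start a run against the pre-built assistant, then poll every 3 seconds
    # (the same interval the commit uses).
    run = client.beta.threads.runs.create(thread_id=thread.id, assistant_id=ASSISTANT_ID)
    while run.status not in ("completed", "failed", "cancelled", "expired"):
        time.sleep(3)
        run = client.beta.threads.runs.retrieve(thread_id=thread.id, run_id=run.id)

    try:
        if run.status != "completed":
            raise RuntimeError(f"assistant run ended with status {run.status}")
        # The newest message in the thread is the assistant's reply.
        messages = client.beta.threads.messages.list(thread_id=thread.id)
        return messages.data[0].content[0].text.value
    finally:
        # Clean up the uploaded file and the thread either way.
        client.files.delete(uploaded.id)
        client.beta.threads.delete(thread.id)

Polling with a fixed sleep mirrors the commit; a hardened version would also cap the number of retries rather than loop indefinitely.
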
41 changes: 13 additions & 28 deletions app/scrape/ScrapeClient.py
@@ -12,6 +12,7 @@
import logging
from dotenv import load_dotenv, find_dotenv
from app.cohere.CohereClient import CohereClient
from openai import OpenAI

# logging.basicConfig()
# logging.getLogger().setLevel(logging.DEBUG)
@@ -24,8 +25,7 @@
if dotenv_path:
load_dotenv(dotenv_path)

COHERE_API_KEY = os.getenv("CO_API_KEY")
co = CohereClient(COHERE_API_KEY)
client = OpenAI()

class ScrapingClient:

@@ -110,20 +110,12 @@ def send_request(self, url, method='GET', scrapeops_proxy_settings=None, **kwarg
soup = bs.BeautifulSoup(source,'lxml')
div = soup.find("div", class_ = "show-more-less-html__markup show-more-less-html__markup--clamp-after-5 relative overflow-hidden" )
if div:
# summarize with cohere
# temperature zero for the time being.
# keeping it at zero allows us to better experiment and tweak things, knowing the LLM is a control.
response = co.summarize(
text=div.get_text(),
length='short',
format='bullets',
model='command',
additional_command='extract the most important qualifications.',
extractiveness='high',
temperature=0.0,
)
prompt = f"Please extract the most important job requirements from the following job posting and list them in point form: {div.get_text()}."
completion = client.chat.completions.create(model="gpt-3.5-turbo", messages=[{"role": "user", "content": prompt}])
response = completion.choices[0].message.content

# first element is always ""
clean_response = response.summary.split('- ')[1:]
clean_response = response.split('- ')[1:]

company = soup.find("a", class_="topcard__org-name-link topcard__flavor--black-link").get_text()
pattern = r"(?<=\n)(.*?)(?=\n)"
@@ -175,20 +167,13 @@ def is_qualification_or_requirement_header(tag):

for qualification in qualifications:
print('qualification: ', qualification)
# summarize with cohere
# temperature zero for the time being.
# keeping it at zero allows us to better experiment and tweak things, knowing the LLM is a control.
qualificationsResponse = co.summarize(
text=', '.join(qualifications),
length='short',
format='bullets',
model='command',
additional_command='extract the most important qualifications.',
extractiveness='high',
temperature=0.0,
)

prompt = f"Please extract the most important job requirements from the following job posting and list them in point form: {div.get_text()}."
completion = client.chat.completions.create(model="gpt-3.5-turbo", messages=[{"role": "user", "content": prompt}])
response = completion.choices[0].message.content

# first element is always ""
clean_response = qualificationsResponse.summary.split('- ')[1:]
clean_response = response.split('- ')[1:]
return {"contents" : clean_response, 'company': job_data[job_id]['company'], 'job_title': job_data[job_id]['jobTitle']}
except Exception as e:
print('Request error:', e)
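
Both ScrapeClient.py hunks make the same substitution: co.summarize is replaced by a single chat completion, and the existing split('- ') parsing is kept to turn the bulleted reply into a list. A minimal sketch of that extraction step, assuming the openai 1.x SDK and an OPENAI_API_KEY in the environment (the function name is illustrative):

from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment


def extract_requirements(posting_text: str) -> list[str]:
    prompt = (
        "Please extract the most important job requirements from the following "
        f"job posting and list them in point form: {posting_text}."
    )
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
    )
    response = completion.choices[0].message.content

    # The reply is expected as "- item" bullets; as in the diff, the text before
    # the first bullet is dropped by taking [1:] after splitting on "- ".
    return [item.strip() for item in response.split("- ")[1:]]
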
