Merge pull request #210 from potpie-ai/java
Enhanced Java parsing support
dhirenmathur authored Dec 13, 2024
2 parents ad79761 + f84508f commit e889a4f
Showing 9 changed files with 272 additions and 234 deletions.
app/modules/code_provider/github/github_router.py (5 changes: 2 additions & 3 deletions)
@@ -16,19 +16,18 @@ async def get_user_repos(
):
user_repo_list = await GithubController(db).get_user_repos(user=user)
user_repo_list["repositories"].extend(config_provider.get_demo_repo_list())

# Remove duplicates while preserving order
seen = set()
deduped_repos = []
for repo in reversed(user_repo_list["repositories"]):
# Use the repository's full_name as the dedup key
repo_key = repo["full_name"]
if repo_key not in seen:
seen.add(repo_key)
deduped_repos.append(repo)

user_repo_list["repositories"] = deduped_repos
return user_repo_list

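The de-duplication above keys each repository on its full_name and walks the merged list in reverse, so when the injected demo list duplicates a user repository, the most recently appended copy wins; note that the resulting list is in reverse of the merged order. A minimal standalone sketch of the pattern, with sample data that is illustrative, not from the repo:

def dedupe_repos(repositories: list[dict]) -> list[dict]:
    # Drop duplicates by full_name, keeping the last occurrence,
    # as in get_user_repos above.
    seen: set[str] = set()
    deduped: list[dict] = []
    for repo in reversed(repositories):
        if repo["full_name"] not in seen:
            seen.add(repo["full_name"])
            deduped.append(repo)
    return deduped

# Hypothetical input: a user repo duplicated by the demo list.
repos = [
    {"full_name": "octocat/hello"},
    {"full_name": "potpie-ai/demo"},
    {"full_name": "octocat/hello"},
]
print(dedupe_repos(repos))
# [{'full_name': 'octocat/hello'}, {'full_name': 'potpie-ai/demo'}]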
app/modules/parsing/graph_construction/code_graph_service.py (54 changes: 35 additions & 19 deletions)
@@ -1,5 +1,6 @@
import hashlib
import logging
import time
from typing import Dict, Optional

from neo4j import GraphDatabase
@@ -43,10 +44,8 @@ def create_and_store_graph(self, repo_dir, project_id, user_id):
nx_graph = self.repo_map.create_graph(repo_dir)

with self.driver.session() as session:
# Create nodes
import time

start_time = time.time() # Start timing
start_time = time.time()
node_count = nx_graph.number_of_nodes()
logging.info(f"Creating {node_count} nodes")

@@ -55,25 +54,42 @@ def create_and_store_graph(self, repo_dir, project_id, user_id):
for i in range(0, node_count, batch_size):
batch_nodes = list(nx_graph.nodes(data=True))[i : i + batch_size]
nodes_to_create = []
for node in batch_nodes:
node_type = node[1].get("type")
label = node_type.capitalize() if node_type else "UNKNOWN"
node_data = {
"name": node[0],
"file_path": node[1].get("file", ""),
"start_line": node[1].get("line", -1),
"end_line": node[1].get("end_line", -1),

for node_id, node_data in batch_nodes:
# Get the node type and ensure it's one of our expected types
node_type = node_data.get("type", "UNKNOWN")
if node_type == "UNKNOWN":
continue
# Initialize labels with NODE
labels = ["NODE"]

# Add specific type label if it's a valid type
if node_type in ["FILE", "CLASS", "FUNCTION", "INTERFACE"]:
labels.append(node_type)

# Prepare node data
processed_node = {
"name": node_data.get(
"name", node_id
), # Use node_id as fallback
"file_path": node_data.get("file", ""),
"start_line": node_data.get("line", -1),
"end_line": node_data.get("end_line", -1),
"repoId": project_id,
"node_id": CodeGraphService.generate_node_id(node[0], user_id),
"node_id": CodeGraphService.generate_node_id(node_id, user_id),
"entityId": user_id,
"type": node_type if node_type else "Unknown",
"text": node[1].get("text", ""),
"labels": ["NODE", label],
"type": node_type,
"text": node_data.get("text", ""),
"labels": labels,
}

# Remove None values
processed_node = {
k: v for k, v in processed_node.items() if v is not None
}
# Remove any null values from node_data
node_data = {k: v for k, v in node_data.items() if v is not None}
nodes_to_create.append(node_data)
nodes_to_create.append(processed_node)

# Create nodes with labels
session.run(
"""
UNWIND $nodes AS node
@@ -112,7 +128,7 @@ def create_and_store_graph(self, repo_dir, project_id, user_id):
edges=edges_to_create,
)

end_time = time.time() # End timing
end_time = time.time()
logging.info(
f"Time taken to create graph and search index: {end_time - start_time:.2f} seconds"
)
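The reworked loop above skips nodes of unknown type, assigns each a base NODE label plus one of FILE, CLASS, FUNCTION, or INTERFACE, strips null values, and writes the batch through a single UNWIND whose Cypher body is truncated in this view. Below is a minimal sketch of one way to create nodes with per-node label lists, assuming the APOC procedure apoc.create.node is available (plain parameterized Cypher cannot set labels dynamically); the query, connection details, and sample node are illustrative, not the repo's actual code:

from neo4j import GraphDatabase

# Illustrative connection details, not from the repo.
driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))

def create_nodes_batch(session, nodes: list[dict]) -> None:
    # Each dict mirrors processed_node above: a "labels" list plus
    # flat properties. apoc.create.node applies the per-node label
    # list that a parameterized CREATE cannot.
    session.run(
        """
        UNWIND $nodes AS props
        CALL apoc.create.node(props.labels, props) YIELD node
        RETURN count(node)
        """,
        nodes=nodes,
    )

with driver.session() as session:
    create_nodes_batch(
        session,
        [
            {
                "name": "com.example.Widget",
                "labels": ["NODE", "CLASS"],
                "file_path": "src/Widget.java",
                "start_line": 1,
                "end_line": 40,
            }
        ],
    )

A real implementation would likely drop the "labels" key from the stored property map (for example via apoc.map.removeKey) rather than persisting it as a property.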
app/modules/parsing/graph_construction/parsing_controller.py (27 changes: 17 additions & 10 deletions)
@@ -75,7 +75,7 @@ async def parse_directory(
project = await project_manager.get_project_from_db(
repo_name, repo_details.branch_name, user_id
)

# First check if this is a demo project that hasn't been accessed by this user yet
if not project and repo_details.repo_name in demo_repos:
existing_project = await project_manager.get_global_project_from_db(
@@ -134,24 +134,28 @@ async def parse_directory(
project_manager,
db,
)

# Handle existing projects (including previously duplicated demo projects)
if project:
project_id = project.id
is_latest = await parse_helper.check_commit_status(project_id)

if not is_latest or project.status != ProjectStatusEnum.READY.value:
cleanup_graph = True
logger.info(f"Submitting parsing task for existing project {project_id}")
logger.info(
f"Submitting parsing task for existing project {project_id}"
)
process_parsing.delay(
repo_details.model_dump(),
user_id,
user_email,
project_id,
cleanup_graph,
)

await project_manager.update_project_status(project_id, ProjectStatusEnum.SUBMITTED)

await project_manager.update_project_status(
project_id, ProjectStatusEnum.SUBMITTED
)
PostHogClient().send_event(
user_id,
"parsed_repo_event",
@@ -161,8 +165,11 @@ async def parse_directory(
"project_id": project_id,
},
)
return {"project_id": project_id, "status": ProjectStatusEnum.SUBMITTED.value}

return {
"project_id": project_id,
"status": ProjectStatusEnum.SUBMITTED.value,
}

return {"project_id": project_id, "status": project.status}
else:
# Handle new non-demo projects
@@ -195,9 +202,9 @@ async def handle_new_project(
}

logger.info(f"Submitting parsing task for new project {new_project_id}")

repo_name = repo_details.repo_name or repo_details.repo_path.split("/")[-1]
await project_manager.register_project(
repo_details.repo_name,
repo_name,
repo_details.branch_name,
user_id,
new_project_id,
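One substantive change in this last hunk: project registration now falls back to the final segment of repo_path when repo_details.repo_name is absent, which covers parsing a local checkout. A small sketch of that fallback, assuming repo_path has no trailing slash; the helper name is illustrative, not from the repo:

from typing import Optional

def resolve_repo_name(repo_name: Optional[str], repo_path: Optional[str]) -> str:
    # Mirrors the expression in handle_new_project:
    # repo_details.repo_name or repo_details.repo_path.split("/")[-1]
    return repo_name or repo_path.split("/")[-1]

assert resolve_repo_name(None, "/home/user/projects/my-repo") == "my-repo"
assert resolve_repo_name("potpie-ai/potpie", None) == "potpie-ai/potpie"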