
Commit bd5f14d

Restructure repository (#26)
* restructure
* 5-inference added
* 5-inference added
* added .gitignore
* refactoring
* Solved embedding insertion into Qdrant vector index
* Added remaining Pulumi resources

---------

Co-authored-by: Vesa Alexandru <vesaalexandru95@gmail.com>
1 parent 8590e4e commit bd5f14d

File tree

132 files changed, +12172 −352 lines changed


.docker/Dockerfile.bytewax

+48
@@ -0,0 +1,48 @@
# Use an official Python runtime as a parent image
FROM python:3.11-slim-bullseye

ENV WORKSPACE_ROOT=/usr/src/app \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    POETRY_HOME="/opt/poetry" \
    POETRY_NO_INTERACTION=1

RUN mkdir -p $WORKSPACE_ROOT

# Install system dependencies
RUN apt-get update -y \
    && apt-get install -y --no-install-recommends build-essential \
    gcc \
    python3-dev \
    curl \
    build-essential \
    && apt-get clean

# Install Poetry
RUN curl -sSL https://install.python-poetry.org | python -

# Add Poetry to PATH
ENV PATH="$POETRY_HOME/bin:$PATH"

RUN apt-get remove -y curl

# Copy the pyproject.toml and poetry.lock files from the root directory
COPY ./pyproject.toml ./poetry.lock ./

# Install dependencies
RUN poetry config virtualenvs.create false && poetry install

# Set the working directory
WORKDIR $WORKSPACE_ROOT

# Copy the 3-feature-pipeline and any other necessary directories
COPY ./3-feature-pipeline .
COPY ./core ./core

# Set the PYTHONPATH environment variable
ENV PYTHONPATH=/usr/src/app

RUN chmod +x /usr/src/app/scripts/bytewax_entrypoint.sh

# Command to run the Bytewax pipeline script
CMD ["/usr/src/app/scripts/bytewax_entrypoint.sh"]

.docker/Dockerfile.cdc

+35
@@ -0,0 +1,35 @@
# Use an official Python runtime as a parent image
FROM python:3.11-slim

# Install system dependencies
RUN apt-get update && apt-get install -y \
    gcc \
    python3-dev \
    curl \
    build-essential \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Install Poetry
RUN pip install poetry

# Add Poetry to PATH
ENV PATH="/etc/poetry/bin:$PATH"

# Set the working directory
WORKDIR /app

# Copy the pyproject.toml and poetry.lock files from the root directory
COPY ./pyproject.toml ./poetry.lock ./

# Install dependencies
RUN poetry install --no-root

# Copy the 2-data-ingestion and core directories
COPY ./2-data-ingestion ./2-data-ingestion

# Set the PYTHONPATH environment variable
ENV PYTHONPATH=/app

# Command to run the script
CMD poetry run python /app/2-data-ingestion/cdc.py && tail -f /dev/null

course/module-1/Dockerfile → .docker/Dockerfile.crawlers

+7 −6
@@ -44,20 +44,21 @@ RUN yum install -y \
 COPY --from=build /opt/chrome-linux /opt/chrome
 COPY --from=build /opt/chromedriver /opt/

-COPY poetry.lock pyproject.toml ./
+COPY ./pyproject.toml ./poetry.lock ./

 # Install Poetry, export dependencies to requirements.txt, and install dependencies
 # in the Lambda task directory, finally cleanup manifest files.
-RUN python3 -m pip install --upgrade pip && pip3 install poetry
-RUN poetry export -f requirements.txt > requirements.txt && \
-    pip3 install --no-cache-dir -r requirements.txt --target "${LAMBDA_TASK_ROOT}" && \
+RUN python -m pip install --upgrade pip && pip install poetry
+RUN poetry export --without 3-feature-pipeline,ml -f requirements.txt > requirements.txt && \
+    pip install --no-cache-dir -r requirements.txt --target "${LAMBDA_TASK_ROOT}" && \
     rm requirements.txt pyproject.toml poetry.lock

 # Optional TLS CA only if you plan to store the extracted data into Document DB
 RUN wget https://truststore.pki.rds.amazonaws.com/global/global-bundle.pem -P ${LAMBDA_TASK_ROOT}
+ENV PYTHONPATH=${LAMBDA_TASK_ROOT}/1-data-crawling

 # Copy function code
-COPY . ${LAMBDA_TASK_ROOT}
+COPY ./1-data-crawling ${LAMBDA_TASK_ROOT}/1-data-crawling

 # Set the CMD to your handler (could also be done as a parameter override outside of the Dockerfile)
-CMD ["main.handler"]
+CMD ["1-data-crawling.main.handler"]

.gitignore

+4
@@ -162,6 +162,10 @@ cython_debug/
 # Ruff
 .ruff_cache

+data/
+dataset/
+data
+
 # Data
 output
 .cache

.python-version

+1
@@ -0,0 +1 @@
+3.11.4
File renamed without changes.

1-data-crawling/config.py

+12
@@ -0,0 +1,12 @@
from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    model_config = SettingsConfigDict(env_file="../.env", env_file_encoding="utf-8")

    # MongoDB configs
    MONGO_DATABASE_HOST: str = "mongodb://localhost:30001,localhost:30002,localhost:30003/?replicaSet=my-replica-set"
    MONGO_DATABASE_NAME: str = "scrabble"


settings = Settings()

1-data-crawling/crawlers/__init__.py

+5
@@ -0,0 +1,5 @@
from .github import GithubCrawler
from .linkedin import LinkedInCrawler
from .medium import MediumCrawler

__all__ = ["GithubCrawler", "LinkedInCrawler", "MediumCrawler"]

1-data-crawling/crawlers/base.py

+67
@@ -0,0 +1,67 @@
import time
from abc import ABC, abstractmethod
from tempfile import mkdtemp

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

from db.documents import BaseDocument


class BaseCrawler(ABC):
    model: type[BaseDocument]

    @abstractmethod
    def extract(self, link: str, **kwargs) -> None: ...


class BaseAbstractCrawler(BaseCrawler, ABC):
    def __init__(self, scroll_limit: int = 5) -> None:
        options = webdriver.ChromeOptions()
        options.binary_location = "/opt/chrome/chrome"
        options.add_argument("--no-sandbox")
        options.add_argument("--headless=new")
        options.add_argument("--single-process")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--disable-gpu")
        options.add_argument("--log-level=3")
        options.add_argument("--disable-popup-blocking")
        options.add_argument("--disable-notifications")
        options.add_argument("--disable-dev-tools")
        options.add_argument("--ignore-certificate-errors")
        options.add_argument("--no-zygote")
        options.add_argument(f"--user-data-dir={mkdtemp()}")
        options.add_argument(f"--data-path={mkdtemp()}")
        options.add_argument(f"--disk-cache-dir={mkdtemp()}")
        options.add_argument("--remote-debugging-port=9222")

        self.set_extra_driver_options(options)

        self.scroll_limit = scroll_limit
        self.driver = webdriver.Chrome(
            service=webdriver.ChromeService("/opt/chromedriver"),
            options=options,
        )

    def set_extra_driver_options(self, options: Options) -> None:
        pass

    def login(self) -> None:
        pass

    def scroll_page(self) -> None:
        """Scroll through the LinkedIn page based on the scroll limit."""
        current_scroll = 0
        last_height = self.driver.execute_script("return document.body.scrollHeight")
        while True:
            self.driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);"
            )
            time.sleep(5)
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height or (
                self.scroll_limit and current_scroll >= self.scroll_limit
            ):
                break
            last_height = new_height
            current_scroll += 1
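
To show how BaseAbstractCrawler is meant to be extended, here is an illustrative Selenium-backed subclass (the ArticleDocument model and the extraction logic are assumptions; this is not the repository's actual Medium or LinkedIn crawler):

from selenium.webdriver.common.by import By

from crawlers.base import BaseAbstractCrawler
from db.documents import ArticleDocument  # assumed document model


class ExampleArticleCrawler(BaseAbstractCrawler):
    model = ArticleDocument

    def set_extra_driver_options(self, options) -> None:
        # Extra Chrome flag on top of the defaults set by the base class.
        options.add_argument("--window-size=1920,1080")

    def extract(self, link: str, **kwargs) -> None:
        self.driver.get(link)
        self.scroll_page()
        content = self.driver.find_element(By.TAG_NAME, "body").text
        self.driver.quit()

        instance = self.model(link=link, content=content, owner_id=kwargs.get("user"))
        instance.save()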

1-data-crawling/crawlers/github.py

+57
@@ -0,0 +1,57 @@
import os
import shutil
import subprocess
import tempfile

from aws_lambda_powertools import Logger

from crawlers.base import BaseCrawler
from db.documents import RepositoryDocument

logger = Logger(service="llm-twin-course/crawler")


class GithubCrawler(BaseCrawler):
    model = RepositoryDocument

    def __init__(self, ignore=(".git", ".toml", ".lock", ".png")) -> None:
        super().__init__()
        self._ignore = ignore

    def extract(self, link: str, **kwargs) -> None:
        logger.info(f"Starting scrapping GitHub repository: {link}")

        repo_name = link.rstrip("/").split("/")[-1]

        local_temp = tempfile.mkdtemp()

        try:
            os.chdir(local_temp)
            subprocess.run(["git", "clone", link])

            repo_path = os.path.join(local_temp, os.listdir(local_temp)[0])

            tree = {}
            for root, dirs, files in os.walk(repo_path):
                dir = root.replace(repo_path, "").lstrip("/")
                if dir.startswith(self._ignore):
                    continue

                for file in files:
                    if file.endswith(self._ignore):
                        continue
                    file_path = os.path.join(dir, file)
                    with open(os.path.join(root, file), "r", errors="ignore") as f:
                        tree[file_path] = f.read().replace(" ", "")

            instance = self.model(
                name=repo_name, link=link, content=tree, owner_id=kwargs.get("user")
            )
            instance.save()

        except Exception:
            raise
        finally:
            shutil.rmtree(local_temp)

        logger.info(f"Finished scrapping GitHub repository: {link}")
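
A quick usage sketch for the crawler above (the repository URL and user id are placeholders): extract clones the repository into a temporary directory, flattens the non-ignored files into a dict, and persists a RepositoryDocument.

from crawlers import GithubCrawler

# Placeholder arguments for illustration only.
crawler = GithubCrawler()
crawler.extract("https://github.com/example-org/example-repo", user="1234")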
