
Commit bd5f14d

Restructure repository (#26)
* restructure
* 5-inference added
* 5-inference added
* added .gitignore
* refactoring
* Solved embedding insertion into Qdrant vector index
* Added remaining Pulumi resources

---------

Co-authored-by: Vesa Alexandru <vesaalexandru95@gmail.com>
1 parent 8590e4e commit bd5f14d

File tree

132 files changed, +12172 −352 lines changed


.docker/Dockerfile.bytewax

+48
@@ -0,0 +1,48 @@
# Use an official Python runtime as a parent image
FROM python:3.11-slim-bullseye

ENV WORKSPACE_ROOT=/usr/src/app \
    PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    POETRY_HOME="/opt/poetry" \
    POETRY_NO_INTERACTION=1

RUN mkdir -p $WORKSPACE_ROOT

# Install system dependencies
RUN apt-get update -y \
    && apt-get install -y --no-install-recommends build-essential \
    gcc \
    python3-dev \
    curl \
    build-essential \
    && apt-get clean

# Install Poetry
RUN curl -sSL https://install.python-poetry.org | python -

# Add Poetry to PATH
ENV PATH="$POETRY_HOME/bin:$PATH"

RUN apt-get remove -y curl

# Copy the pyproject.toml and poetry.lock files from the root directory
COPY ./pyproject.toml ./poetry.lock ./

# Install dependencies
RUN poetry config virtualenvs.create false && poetry install

# Set the working directory
WORKDIR $WORKSPACE_ROOT

# Copy the 3-feature-pipeline and any other necessary directories
COPY ./3-feature-pipeline .
COPY ./core ./core

# Set the PYTHONPATH environment variable
ENV PYTHONPATH=/usr/src/app

RUN chmod +x /usr/src/app/scripts/bytewax_entrypoint.sh

# Command to run the Bytewax pipeline script
CMD ["/usr/src/app/scripts/bytewax_entrypoint.sh"]

.docker/Dockerfile.cdc

+35
@@ -0,0 +1,35 @@
# Use an official Python runtime as a parent image
FROM python:3.11-slim

# Install system dependencies
RUN apt-get update && apt-get install -y \
    gcc \
    python3-dev \
    curl \
    build-essential \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Install Poetry
RUN pip install poetry

# Add Poetry to PATH
ENV PATH="/etc/poetry/bin:$PATH"

# Set the working directory
WORKDIR /app

# Copy the pyproject.toml and poetry.lock files from the root directory
COPY ./pyproject.toml ./poetry.lock ./

# Install dependencies
RUN poetry install --no-root

# Copy the 2-data-ingestion and core directories
COPY ./2-data-ingestion ./2-data-ingestion

# Set the PYTHONPATH environment variable
ENV PYTHONPATH=/app

# Command to run the script
CMD poetry run python /app/2-data-ingestion/cdc.py && tail -f /dev/null

course/module-1/Dockerfile → .docker/Dockerfile.crawlers

+7 −6
@@ -44,20 +44,21 @@ RUN yum install -y \
 COPY --from=build /opt/chrome-linux /opt/chrome
 COPY --from=build /opt/chromedriver /opt/

-COPY poetry.lock pyproject.toml ./
+COPY ./pyproject.toml ./poetry.lock ./

 # Install Poetry, export dependencies to requirements.txt, and install dependencies
 # in the Lambda task directory, finally cleanup manifest files.
-RUN python3 -m pip install --upgrade pip && pip3 install poetry
-RUN poetry export -f requirements.txt > requirements.txt && \
-    pip3 install --no-cache-dir -r requirements.txt --target "${LAMBDA_TASK_ROOT}" && \
+RUN python -m pip install --upgrade pip && pip install poetry
+RUN poetry export --without 3-feature-pipeline,ml -f requirements.txt > requirements.txt && \
+    pip install --no-cache-dir -r requirements.txt --target "${LAMBDA_TASK_ROOT}" && \
     rm requirements.txt pyproject.toml poetry.lock

 # Optional TLS CA only if you plan to store the extracted data into Document DB
 RUN wget https://truststore.pki.rds.amazonaws.com/global/global-bundle.pem -P ${LAMBDA_TASK_ROOT}
+ENV PYTHONPATH=${LAMBDA_TASK_ROOT}/1-data-crawling

 # Copy function code
-COPY . ${LAMBDA_TASK_ROOT}
+COPY ./1-data-crawling ${LAMBDA_TASK_ROOT}/1-data-crawling

 # Set the CMD to your handler (could also be done as a parameter override outside of the Dockerfile)
-CMD ["main.handler"]
+CMD ["1-data-crawling.main.handler"]

.gitignore

+4
@@ -162,6 +162,10 @@ cython_debug/
 # Ruff
 .ruff_cache

+data/
+dataset/
+data
+
 # Data
 output
 .cache

.python-version

+1
@@ -0,0 +1 @@
+3.11.4
File renamed without changes.

1-data-crawling/config.py

+12
@@ -0,0 +1,12 @@
from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    model_config = SettingsConfigDict(env_file="../.env", env_file_encoding="utf-8")

    # MongoDB configs
    MONGO_DATABASE_HOST: str = "mongodb://localhost:30001,localhost:30002,localhost:30003/?replicaSet=my-replica-set"
    MONGO_DATABASE_NAME: str = "scrabble"


settings = Settings()

1-data-crawling/crawlers/__init__.py

+5
@@ -0,0 +1,5 @@
from .github import GithubCrawler
from .linkedin import LinkedInCrawler
from .medium import MediumCrawler

__all__ = ["GithubCrawler", "LinkedInCrawler", "MediumCrawler"]

1-data-crawling/crawlers/base.py

+67
@@ -0,0 +1,67 @@
import time
from abc import ABC, abstractmethod
from tempfile import mkdtemp

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

from db.documents import BaseDocument


class BaseCrawler(ABC):
    model: type[BaseDocument]

    @abstractmethod
    def extract(self, link: str, **kwargs) -> None: ...


class BaseAbstractCrawler(BaseCrawler, ABC):
    def __init__(self, scroll_limit: int = 5) -> None:
        options = webdriver.ChromeOptions()
        options.binary_location = "/opt/chrome/chrome"
        options.add_argument("--no-sandbox")
        options.add_argument("--headless=new")
        options.add_argument("--single-process")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--disable-gpu")
        options.add_argument("--log-level=3")
        options.add_argument("--disable-popup-blocking")
        options.add_argument("--disable-notifications")
        options.add_argument("--disable-dev-tools")
        options.add_argument("--ignore-certificate-errors")
        options.add_argument("--no-zygote")
        options.add_argument(f"--user-data-dir={mkdtemp()}")
        options.add_argument(f"--data-path={mkdtemp()}")
        options.add_argument(f"--disk-cache-dir={mkdtemp()}")
        options.add_argument("--remote-debugging-port=9222")

        self.set_extra_driver_options(options)

        self.scroll_limit = scroll_limit
        self.driver = webdriver.Chrome(
            service=webdriver.ChromeService("/opt/chromedriver"),
            options=options,
        )

    def set_extra_driver_options(self, options: Options) -> None:
        pass

    def login(self) -> None:
        pass

    def scroll_page(self) -> None:
        """Scroll through the LinkedIn page based on the scroll limit."""
        current_scroll = 0
        last_height = self.driver.execute_script("return document.body.scrollHeight")
        while True:
            self.driver.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);"
            )
            time.sleep(5)
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height or (
                self.scroll_limit and current_scroll >= self.scroll_limit
            ):
                break
            last_height = new_height
            current_scroll += 1
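
To show how BaseAbstractCrawler is meant to be extended, here is an illustrative Selenium-backed subclass (the ArticleDocument model and the extraction logic are assumptions; this is not the repository's actual Medium or LinkedIn crawler):

from selenium.webdriver.common.by import By

from crawlers.base import BaseAbstractCrawler
from db.documents import ArticleDocument  # assumed document model


class ExampleArticleCrawler(BaseAbstractCrawler):
    model = ArticleDocument

    def set_extra_driver_options(self, options) -> None:
        # Extra Chrome flag on top of the defaults set by the base class.
        options.add_argument("--window-size=1920,1080")

    def extract(self, link: str, **kwargs) -> None:
        self.driver.get(link)
        self.scroll_page()
        content = self.driver.find_element(By.TAG_NAME, "body").text
        self.driver.quit()

        instance = self.model(link=link, content=content, owner_id=kwargs.get("user"))
        instance.save()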

1-data-crawling/crawlers/github.py

+57
@@ -0,0 +1,57 @@
import os
import shutil
import subprocess
import tempfile

from aws_lambda_powertools import Logger

from crawlers.base import BaseCrawler
from db.documents import RepositoryDocument

logger = Logger(service="llm-twin-course/crawler")


class GithubCrawler(BaseCrawler):
    model = RepositoryDocument

    def __init__(self, ignore=(".git", ".toml", ".lock", ".png")) -> None:
        super().__init__()
        self._ignore = ignore

    def extract(self, link: str, **kwargs) -> None:
        logger.info(f"Starting scrapping GitHub repository: {link}")

        repo_name = link.rstrip("/").split("/")[-1]

        local_temp = tempfile.mkdtemp()

        try:
            os.chdir(local_temp)
            subprocess.run(["git", "clone", link])

            repo_path = os.path.join(local_temp, os.listdir(local_temp)[0])

            tree = {}
            for root, dirs, files in os.walk(repo_path):
                dir = root.replace(repo_path, "").lstrip("/")
                if dir.startswith(self._ignore):
                    continue

                for file in files:
                    if file.endswith(self._ignore):
                        continue
                    file_path = os.path.join(dir, file)
                    with open(os.path.join(root, file), "r", errors="ignore") as f:
                        tree[file_path] = f.read().replace(" ", "")

            instance = self.model(
                name=repo_name, link=link, content=tree, owner_id=kwargs.get("user")
            )
            instance.save()

        except Exception:
            raise
        finally:
            shutil.rmtree(local_temp)

        logger.info(f"Finished scrapping GitHub repository: {link}")
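
A quick usage sketch for the crawler above (the repository URL and user id are placeholders): extract clones the repository into a temporary directory, flattens the non-ignored files into a dict, and persists a RepositoryDocument.

from crawlers import GithubCrawler

# Placeholder arguments for illustration only.
crawler = GithubCrawler()
crawler.extract("https://github.com/example-org/example-repo", user="1234")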
