diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..94f480de94e1d767531580401cbf13844868e82b --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +* text=auto eol=lf \ No newline at end of file diff --git a/collector/scripts/sitemap.py b/collector/scripts/sitemap.py index e780bd9cc9fbc847b08684186420375cf30d3122..f1c3b45cc8e44c5dcfb2e4c6323ce20037048b90 100644 --- a/collector/scripts/sitemap.py +++ b/collector/scripts/sitemap.py @@ -1,39 +1,39 @@ -import requests -import xml.etree.ElementTree as ET -from scripts.link import parse_links -import re - -def parse_sitemap(url): - response = requests.get(url) - root = ET.fromstring(response.content) - - urls = [] - for element in root.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}url'): - for loc in element.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}loc'): - if not has_extension_to_ignore(loc.text): - urls.append(loc.text) - else: - print(f"Skipping filetype: {loc.text}") - - return urls - -# Example sitemap URL https://www.nerdwallet.com/blog/wp-sitemap-news-articles-1.xml -def sitemap(): - sitemap_url = input("Enter the URL of the sitemap: ") - - if(len(sitemap_url) == 0): - print("No valid sitemap provided!") - exit(1) - - url_array = parse_sitemap(sitemap_url) - - #parse links from array - parse_links(url_array) - -def has_extension_to_ignore(string): - image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.pdf'] - - pattern = r'\b(' + '|'.join(re.escape(ext) for ext in image_extensions) + r')\b' - match = re.search(pattern, string, re.IGNORECASE) - +import requests +import xml.etree.ElementTree as ET +from scripts.link import parse_links +import re + +def parse_sitemap(url): + response = requests.get(url) + root = ET.fromstring(response.content) + + urls = [] + for element in root.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}url'): + for loc in element.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}loc'): + if not has_extension_to_ignore(loc.text): + urls.append(loc.text) + else: + print(f"Skipping filetype: {loc.text}") + + return urls + +# Example sitemap URL https://www.nerdwallet.com/blog/wp-sitemap-news-articles-1.xml +def sitemap(): + sitemap_url = input("Enter the URL of the sitemap: ") + + if(len(sitemap_url) == 0): + print("No valid sitemap provided!") + exit(1) + + url_array = parse_sitemap(sitemap_url) + + #parse links from array + parse_links(url_array) + +def has_extension_to_ignore(string): + image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.pdf'] + + pattern = r'\b(' + '|'.join(re.escape(ext) for ext in image_extensions) + r')\b' + match = re.search(pattern, string, re.IGNORECASE) + return match is not None \ No newline at end of file diff --git a/docker/Dockerfile b/docker/Dockerfile index f69f041ec7432e281739915849aeea6daca65343..1625263ddb3cae062ea4724b5f4970adc9ddd426 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -34,12 +34,10 @@ RUN groupadd -g $ARG_GID anythingllm && \ # Copy docker helper scripts COPY ./docker/docker-entrypoint.sh /usr/local/bin/ COPY ./docker/docker-healthcheck.sh /usr/local/bin/ -COPY ./docker/dual_boot.sh /usr/local/bin/ # Ensure the scripts are executable RUN chmod +x /usr/local/bin/docker-entrypoint.sh && \ - chmod +x /usr/local/bin/docker-healthcheck.sh && \ - chmod 777 /usr/local/bin/dual_boot.sh + chmod +x /usr/local/bin/docker-healthcheck.sh USER anythingllm @@ -91,6 +89,4 @@ HEALTHCHECK --interval=1m --timeout=10s --start-period=1m \ CMD /bin/bash /usr/local/bin/docker-healthcheck.sh || exit 1 # Run the server -ENTRYPOINT ["docker-entrypoint.sh"] - -CMD /bin/bash /usr/local/bin/dual_boot.sh \ No newline at end of file +ENTRYPOINT ["/bin/bash", "/usr/local/bin/docker-entrypoint.sh"] \ No newline at end of file diff --git a/docker/docker-entrypoint.sh b/docker/docker-entrypoint.sh index 97cfd9d2e5f422a1ed2f2f70dca8ce28f6c39243..37587178a2f745ddbe2d6fa471bfef134259c4cb 100755 --- a/docker/docker-entrypoint.sh +++ b/docker/docker-entrypoint.sh @@ -1,3 +1,5 @@ -#!/usr/bin/env bash - -exec "$@" \ No newline at end of file +#!/bin/bash +node /app/server/index.js & +{ FLASK_ENV=production FLASK_APP=wsgi.py cd collector && gunicorn --workers 4 --bind 0.0.0.0:8888 wsgi:api; } & +wait -n +exit $? \ No newline at end of file diff --git a/docker/dual_boot.sh b/docker/dual_boot.sh deleted file mode 100644 index 37587178a2f745ddbe2d6fa471bfef134259c4cb..0000000000000000000000000000000000000000 --- a/docker/dual_boot.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash -node /app/server/index.js & -{ FLASK_ENV=production FLASK_APP=wsgi.py cd collector && gunicorn --workers 4 --bind 0.0.0.0:8888 wsgi:api; } & -wait -n -exit $? \ No newline at end of file