diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..51b2d84 Binary files /dev/null and b/.DS_Store differ diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..86455d3 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,3 @@ +.git/ +venv/ +test/ diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 0000000..e674a22 --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1,9 @@ +# These are supported funding model platforms +github: benbusby +ko_fi: benbusby +tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel +community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry +liberapay: # Replace with a single Liberapay username +issuehunt: # Replace with a single IssueHunt username +otechie: # Replace with a single Otechie username +custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..a174b78 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,45 @@ +--- +name: Bug report +about: Create a bug report to help fix an issue with Whoogle +title: "[BUG] " +labels: bug +assignees: '' + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**To Reproduce** +Steps to reproduce the behavior: +1. Go to '...' +2. Click on '....' +3. Scroll down to '....' +4. See error + +**Deployment Method** +- [ ] Heroku (one-click deploy) +- [ ] Docker +- [ ] `run` executable +- [ ] pip/pipx +- [ ] Other: [describe setup] + +**Version of Whoogle Search** +- [ ] Latest build from [source] (i.e. GitHub, Docker Hub, pip, etc) +- [ ] Version [version number] +- [ ] Not sure + + +**Desktop (please complete the following information):** + - OS: [e.g. iOS] + - Browser [e.g. chrome, safari] + - Version [e.g. 
22] + +**Smartphone (please complete the following information):** + - Device: [e.g. iPhone6] + - OS: [e.g. iOS8.1] + - Browser [e.g. stock browser, safari] + - Version [e.g. 22] + +**Additional context** +Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000..f91a033 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest a feature that would improve Whoogle +title: "[FEATURE] " +labels: enhancement +assignees: '' + +--- + + + +**Describe the feature you'd like to see added** +A short description of the feature, and what it would accomplish. + +**Additional context** +Add any other context or screenshots about the feature request here. diff --git a/.github/ISSUE_TEMPLATE/new-theme.md b/.github/ISSUE_TEMPLATE/new-theme.md new file mode 100644 index 0000000..9653b09 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/new-theme.md @@ -0,0 +1,38 @@ +--- +name: New theme +about: Create a new theme for Whoogle +title: "[THEME] " +labels: theme +assignees: benbusby + +--- + +Use the following template to design your theme, replacing the blank spaces with the colors of your choice. 
+ +```css +:root { + /* LIGHT THEME COLORS */ + --whoogle-logo: #______; + --whoogle-page-bg: #______; + --whoogle-element-bg: #______; + --whoogle-text: #______; + --whoogle-contrast-text: #______; + --whoogle-secondary-text: #______; + --whoogle-result-bg: #______; + --whoogle-result-title: #______; + --whoogle-result-url: #______; + --whoogle-result-visited: #______; + + /* DARK THEME COLORS */ + --whoogle-dark-logo: #______; + --whoogle-dark-page-bg: #______; + --whoogle-dark-element-bg: #______; + --whoogle-dark-text: #______; + --whoogle-dark-contrast-text: #______; + --whoogle-dark-secondary-text: #______; + --whoogle-dark-result-bg: #______; + --whoogle-dark-result-title: #______; + --whoogle-dark-result-url: #______; + --whoogle-dark-result-visited: #______; +} +``` diff --git a/.github/ISSUE_TEMPLATE/question.md b/.github/ISSUE_TEMPLATE/question.md new file mode 100644 index 0000000..a1d9b21 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/question.md @@ -0,0 +1,10 @@ +--- +name: Question +about: Ask a (simple) question about Whoogle +title: "[QUESTION] " +labels: question +assignees: '' + +--- + +Type out your question here. Please make sure that this is a topic that isn't already covered in the README. 
diff --git a/.github/workflows/buildx.yml b/.github/workflows/buildx.yml new file mode 100644 index 0000000..b08826d --- /dev/null +++ b/.github/workflows/buildx.yml @@ -0,0 +1,59 @@ +name: buildx + +on: + workflow_run: + workflows: ["docker_main"] + branches: [main] + types: + - completed + push: + tags: + - '*' + +jobs: + on-success: + runs-on: ubuntu-latest + steps: + - name: Wait for tests to succeed + if: ${{ github.event.workflow_run.conclusion != 'success' && startsWith(github.ref, 'refs/tags') != true }} + run: exit 1 + - name: checkout code + uses: actions/checkout@v2 + - name: install buildx + id: buildx + uses: crazy-max/ghaction-docker-buildx@v1 + with: + version: latest + - name: Login to Docker Hub + uses: docker/login-action@v1 + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} + - name: Login to ghcr.io + uses: docker/login-action@v1 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: build and push the image + if: startsWith(github.ref, 'refs/heads/main') && github.actor == 'benbusby' + run: | + docker run --rm --privileged multiarch/qemu-user-static --reset -p yes + docker buildx ls + docker buildx build --push \ + --tag benbusby/whoogle-search:latest \ + --platform linux/amd64,linux/arm64 . + docker buildx build --push \ + --tag ghcr.io/benbusby/whoogle-search:latest \ + --platform linux/amd64,linux/arm64 . + - name: build and push tag + if: startsWith(github.ref, 'refs/tags') + run: | + docker run --rm --privileged multiarch/qemu-user-static --reset -p yes + docker buildx ls + docker buildx build --push \ + --tag benbusby/whoogle-search:${GITHUB_REF#refs/*/v}\ + --platform linux/amd64,linux/arm/v7,linux/arm64 . + docker buildx build --push \ + --tag ghcr.io/benbusby/whoogle-search:${GITHUB_REF#refs/*/v}\ + --platform linux/amd64,linux/arm/v7,linux/arm64 . 
diff --git a/.github/workflows/docker_main.yml b/.github/workflows/docker_main.yml new file mode 100644 index 0000000..f369f47 --- /dev/null +++ b/.github/workflows/docker_main.yml @@ -0,0 +1,28 @@ +name: docker_main + +on: + workflow_run: + workflows: ["tests"] + branches: [main] + types: + - completed + +# TODO: Needs refactoring to use reusable workflows and share w/ docker_tests +jobs: + on-success: + runs-on: ubuntu-latest + steps: + - name: checkout code + uses: actions/checkout@v2 + - name: build and test (docker) + run: | + docker build --tag whoogle-search:test . + docker run --publish 5000:5000 --detach --name whoogle-search-nocompose whoogle-search:test + sleep 15 + docker exec whoogle-search-nocompose curl -f http://localhost:5000/healthz || exit 1 + - name: build and test (docker-compose) + run: | + docker rm -f whoogle-search-nocompose + WHOOGLE_IMAGE="whoogle-search:test" docker compose up --detach + sleep 15 + docker exec whoogle-search curl -f http://localhost:5000/healthz || exit 1 diff --git a/.github/workflows/docker_tests.yml b/.github/workflows/docker_tests.yml new file mode 100644 index 0000000..fd96b9d --- /dev/null +++ b/.github/workflows/docker_tests.yml @@ -0,0 +1,26 @@ +name: docker_tests + +on: + push: + branches: main + pull_request: + branches: main + +jobs: + docker: + runs-on: ubuntu-latest + steps: + - name: checkout code + uses: actions/checkout@v2 + - name: build and test (docker) + run: | + docker build --tag whoogle-search:test . 
+ docker run --publish 5000:5000 --detach --name whoogle-search-nocompose whoogle-search:test + sleep 15 + docker exec whoogle-search-nocompose curl -f http://localhost:5000/healthz || exit 1 + - name: build and test (docker compose) + run: | + docker rm -f whoogle-search-nocompose + WHOOGLE_IMAGE="whoogle-search:test" docker compose up --detach + sleep 15 + docker exec whoogle-search curl -f http://localhost:5000/healthz || exit 1 diff --git a/.github/workflows/pypi.yml b/.github/workflows/pypi.yml new file mode 100644 index 0000000..80486f2 --- /dev/null +++ b/.github/workflows/pypi.yml @@ -0,0 +1,67 @@ +name: pypi + +on: + push: + branches: main + tags: v* + +jobs: + publish-test: + name: Build and publish to TestPyPI + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Python 3.9 + uses: actions/setup-python@v5 + with: + python-version: 3.9 + - name: Install pypa/build + run: >- + python -m + pip install + build + setuptools + --user + - name: Set dev timestamp + run: echo "DEV_BUILD=$(date +%s)" >> $GITHUB_ENV + - name: Build binary wheel and source tarball + run: >- + python -m + build + --sdist + --wheel + --outdir dist/ + . + - name: Publish distribution to TestPyPI + uses: pypa/gh-action-pypi-publish@master + with: + password: ${{ secrets.TEST_PYPI_API_TOKEN }} + repository_url: https://test.pypi.org/legacy/ + publish: + name: Build and publish to PyPI + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Python 3.9 + uses: actions/setup-python@v5 + with: + python-version: 3.9 + - name: Install pypa/build + run: >- + python -m + pip install + build + --user + - name: Build binary wheel and source tarball + run: >- + python -m + build + --sdist + --wheel + --outdir dist/ + . 
+ - name: Publish distribution to PyPI + if: startsWith(github.ref, 'refs/tags') + uses: pypa/gh-action-pypi-publish@master + with: + password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/.github/workflows/scan.yml b/.github/workflows/scan.yml new file mode 100644 index 0000000..8790e89 --- /dev/null +++ b/.github/workflows/scan.yml @@ -0,0 +1,19 @@ +name: scan + +on: + schedule: + - cron: '0 0 * * *' + +jobs: + scan: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Build the container image + run: | + docker build --tag whoogle-search:test . + - name: Initiate grype scan + run: | + curl -sSfL https://raw.githubusercontent.com/anchore/grype/main/install.sh | sh -s -- -b . + chmod +x ./grype + ./grype whoogle-search:test --only-fixed diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..97573fc --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,17 @@ +name: tests + +on: [push, pull_request] + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.x' + - name: Install dependencies + run: pip install --upgrade pip && pip install -r requirements.txt + - name: Run tests + run: ./run test diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6b3be3e --- /dev/null +++ b/.gitignore @@ -0,0 +1,27 @@ +venv/ +.venv/ +.idea/ +__pycache__/ +*.pyc +*.pem +*.conf +*.key +config.json +test/static +flask_session/ +app/static/config +app/static/custom_config +app/static/bangs/* +!app/static/bangs/00-whoogle.json + +# pip stuff +/build/ +dist/ +*.egg-info/ + +# env +whoogle.env + +# vim +*~ +*.swp diff --git a/.replit b/.replit new file mode 100644 index 0000000..d1c9f6f --- /dev/null +++ b/.replit @@ -0,0 +1 @@ +entrypoint = "misc/replit.py" diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..dcaf27c --- /dev/null +++ b/Dockerfile @@ -0,0 +1,103 @@ 
+FROM python:3.12.6-alpine3.20 AS builder + +RUN apk --update add \ + build-base \ + libxml2-dev \ + libxslt-dev \ + openssl-dev \ + libffi-dev + +COPY requirements.txt . + +RUN pip install --upgrade pip +RUN pip install --prefix /install --no-warn-script-location --no-cache-dir -r requirements.txt + +FROM python:3.12.6-alpine3.20 + +RUN apk add --update --no-cache tor curl openrc libstdc++ +# git go //for obfs4proxy +# libcurl4-openssl-dev + +RUN apk -U upgrade + +# uncomment to build obfs4proxy +# RUN git clone https://gitlab.com/yawning/obfs4.git +# WORKDIR /obfs4 +# RUN go build -o obfs4proxy/obfs4proxy ./obfs4proxy +# RUN cp ./obfs4proxy/obfs4proxy /usr/bin/obfs4proxy + +ARG DOCKER_USER=whoogle +ARG DOCKER_USERID=927 +ARG config_dir=/config +RUN mkdir -p $config_dir +RUN chmod a+w $config_dir +VOLUME $config_dir + +ARG url_prefix='' +ARG username='' +ARG password='' +ARG proxyuser='' +ARG proxypass='' +ARG proxytype='' +ARG proxyloc='' +ARG whoogle_dotenv='' +ARG use_https='' +ARG whoogle_port=5000 +ARG twitter_alt='farside.link/nitter' +ARG youtube_alt='farside.link/invidious' +ARG reddit_alt='farside.link/libreddit' +ARG medium_alt='farside.link/scribe' +ARG translate_alt='farside.link/lingva' +ARG imgur_alt='farside.link/rimgo' +ARG wikipedia_alt='farside.link/wikiless' +ARG imdb_alt='farside.link/libremdb' +ARG quora_alt='farside.link/quetre' +ARG so_alt='farside.link/anonymousoverflow' + +ENV CONFIG_VOLUME=$config_dir \ + WHOOGLE_URL_PREFIX=$url_prefix \ + WHOOGLE_USER=$username \ + WHOOGLE_PASS=$password \ + WHOOGLE_PROXY_USER=$proxyuser \ + WHOOGLE_PROXY_PASS=$proxypass \ + WHOOGLE_PROXY_TYPE=$proxytype \ + WHOOGLE_PROXY_LOC=$proxyloc \ + WHOOGLE_DOTENV=$whoogle_dotenv \ + HTTPS_ONLY=$use_https \ + EXPOSE_PORT=$whoogle_port \ + WHOOGLE_ALT_TW=$twitter_alt \ + WHOOGLE_ALT_YT=$youtube_alt \ + WHOOGLE_ALT_RD=$reddit_alt \ + WHOOGLE_ALT_MD=$medium_alt \ + WHOOGLE_ALT_TL=$translate_alt \ + WHOOGLE_ALT_IMG=$imgur_alt \ + WHOOGLE_ALT_WIKI=$wikipedia_alt \ + 
WHOOGLE_ALT_IMDB=$imdb_alt \ + WHOOGLE_ALT_QUORA=$quora_alt \ + WHOOGLE_ALT_SO=$so_alt + +WORKDIR /whoogle + +COPY --from=builder /install /usr/local +COPY misc/tor/torrc /etc/tor/torrc +COPY misc/tor/start-tor.sh misc/tor/start-tor.sh +COPY app/ app/ +COPY run whoogle.env* ./ + +# Create user/group to run as +RUN adduser -D -g $DOCKER_USERID -u $DOCKER_USERID $DOCKER_USER + +# Fix ownership / permissions +RUN chown -R ${DOCKER_USER}:${DOCKER_USER} /whoogle /var/lib/tor + +# Allow writing symlinks to build dir +RUN chown $DOCKER_USERID:$DOCKER_USERID app/static/build + +USER $DOCKER_USER:$DOCKER_USER + +EXPOSE $EXPOSE_PORT + +HEALTHCHECK --interval=30s --timeout=5s \ + CMD curl -f http://localhost:${EXPOSE_PORT}/healthz || exit 1 + +CMD misc/tor/start-tor.sh & ./run diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..c8b71df --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 Ben Busby + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..c853358 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,6 @@ +graft app/static +graft app/templates +graft app/misc +include requirements.txt +recursive-include test +global-exclude *.pyc diff --git a/app.json b/app.json new file mode 100644 index 0000000..6a9b7e8 --- /dev/null +++ b/app.json @@ -0,0 +1,194 @@ +{ + "name": "Whoogle Search", + "description": "A lightweight, privacy-oriented, containerized Google search proxy for desktop/mobile that removes Javascript, AMP links, tracking, and ads/sponsored content", + "repository": "https://github.com/benbusby/whoogle-search", + "logo": "https://raw.githubusercontent.com/benbusby/whoogle-search/master/app/static/img/favicon/ms-icon-150x150.png", + "keywords": [ + "search", + "metasearch", + "flask", + "docker", + "heroku", + "adblock", + "degoogle", + "privacy" + ], + "stack": "container", + "env": { + "WHOOGLE_URL_PREFIX": { + "description": "The URL prefix to use for the whoogle instance (i.e. \"/whoogle\")", + "value": "", + "required": false + }, + "WHOOGLE_USER": { + "description": "The username for basic auth. WHOOGLE_PASS must also be set if used. Leave empty to disable.", + "value": "", + "required": false + }, + "WHOOGLE_PASS": { + "description": "The password for basic auth. WHOOGLE_USER must also be set if used. Leave empty to disable.", + "value": "", + "required": false + }, + "WHOOGLE_PROXY_USER": { + "description": "The username of the proxy server. Leave empty to disable.", + "value": "", + "required": false + }, + "WHOOGLE_PROXY_PASS": { + "description": "The password of the proxy server. Leave empty to disable.", + "value": "", + "required": false + }, + "WHOOGLE_PROXY_TYPE": { + "description": "The type of the proxy server. For example \"socks5\". Leave empty to disable.", + "value": "", + "required": false + }, + "WHOOGLE_PROXY_LOC": { + "description": "The location of the proxy server (host or ip). 
Leave empty to disable.", + "value": "", + "required": false + }, + "WHOOGLE_ALT_TW": { + "description": "The site to use as a replacement for twitter.com when site alternatives are enabled in the config.", + "value": "farside.link/nitter", + "required": false + }, + "WHOOGLE_ALT_YT": { + "description": "The site to use as a replacement for youtube.com when site alternatives are enabled in the config.", + "value": "farside.link/invidious", + "required": false + }, + "WHOOGLE_ALT_RD": { + "description": "The site to use as a replacement for reddit.com when site alternatives are enabled in the config.", + "value": "farside.link/libreddit", + "required": false + }, + "WHOOGLE_ALT_MD": { + "description": "The site to use as a replacement for medium.com when site alternatives are enabled in the config.", + "value": "farside.link/scribe", + "required": false + }, + "WHOOGLE_ALT_TL": { + "description": "The Google Translate alternative to use for all searches following the 'translate ___' structure.", + "value": "farside.link/lingva", + "required": false + }, + "WHOOGLE_ALT_IMG": { + "description": "The site to use as a replacement for imgur.com when site alternatives are enabled in the config.", + "value": "farside.link/rimgo", + "required": false + }, + "WHOOGLE_ALT_WIKI": { + "description": "The site to use as a replacement for wikipedia.com when site alternatives are enabled in the config.", + "value": "farside.link/wikiless", + "required": false + }, + "WHOOGLE_ALT_IMDB": { + "description": "The site to use as a replacement for imdb.com when site alternatives are enabled in the config.", + "value": "farside.link/libremdb", + "required": false + }, + "WHOOGLE_ALT_QUORA": { + "description": "The site to use as a replacement for quora.com when site alternatives are enabled in the config.", + "value": "farside.link/quetre", + "required": false + }, + "WHOOGLE_ALT_SO": { + "description": "The site to use as a replacement for stackoverflow.com when site alternatives are 
enabled in the config.", + "value": "farside.link/anonymousoverflow", + "required": false + }, + "WHOOGLE_MINIMAL": { + "description": "Remove everything except basic result cards from all search queries (set to 1 or leave blank)", + "value": "", + "required": false + }, + "WHOOGLE_CONFIG_COUNTRY": { + "description": "[CONFIG] The country to use for restricting search results (use values from https://raw.githubusercontent.com/benbusby/whoogle-search/develop/app/static/settings/countries.json)", + "value": "", + "required": false + }, + "WHOOGLE_CONFIG_TIME_PERIOD" : { + "description": "[CONFIG] The time period to use for restricting search results", + "value": "", + "required": false + }, + "WHOOGLE_CONFIG_LANGUAGE": { + "description": "[CONFIG] The language to use for the interface (use values from https://raw.githubusercontent.com/benbusby/whoogle-search/develop/app/static/settings/languages.json)", + "value": "", + "required": false + }, + "WHOOGLE_CONFIG_SEARCH_LANGUAGE": { + "description": "[CONFIG] The language to use for search results (use values from https://raw.githubusercontent.com/benbusby/whoogle-search/develop/app/static/settings/languages.json)", + "value": "", + "required": false + }, + "WHOOGLE_CONFIG_DISABLE": { + "description": "[CONFIG] Disable ability for client to change config (set to 1 or leave blank)", + "value": "", + "required": false + }, + "WHOOGLE_CONFIG_BLOCK": { + "description": "[CONFIG] Block websites from search results (comma-separated list)", + "value": "", + "required": false + }, + "WHOOGLE_CONFIG_THEME": { + "description": "[CONFIG] Set theme to 'dark', 'light', or 'system'", + "value": "system", + "required": false + }, + "WHOOGLE_CONFIG_SAFE": { + "description": "[CONFIG] Use safe mode for searches (set to 1 or leave blank)", + "value": "", + "required": false + }, + "WHOOGLE_CONFIG_ALTS": { + "description": "[CONFIG] Use social media alternatives (set to 1 or leave blank)", + "value": "", + "required": false + }, + 
"WHOOGLE_CONFIG_NEAR": { + "description": "[CONFIG] Restrict results to only those near a particular city", + "value": "", + "required": false + }, + "WHOOGLE_CONFIG_TOR": { + "description": "[CONFIG] Use Tor, if available (set to 1 or leave blank)", + "value": "", + "required": false + }, + "WHOOGLE_CONFIG_NEW_TAB": { + "description": "[CONFIG] Always open results in new tab (set to 1 or leave blank)", + "value": "", + "required": false + }, + "WHOOGLE_CONFIG_VIEW_IMAGE": { + "description": "[CONFIG] Enable View Image option (set to 1 or leave blank)", + "value": "", + "required": false + }, + "WHOOGLE_CONFIG_GET_ONLY": { + "description": "[CONFIG] Search using GET requests only (set to 1 or leave blank)", + "value": "", + "required": false + }, + "WHOOGLE_CONFIG_STYLE": { + "description": "[CONFIG] Custom CSS styling (paste in CSS or leave blank)", + "value": ":root { /* LIGHT THEME COLORS */ --whoogle-background: #d8dee9; --whoogle-accent: #2e3440; --whoogle-text: #3B4252; --whoogle-contrast-text: #eceff4; --whoogle-secondary-text: #70757a; --whoogle-result-bg: #fff; --whoogle-result-title: #4c566a; --whoogle-result-url: #81a1c1; --whoogle-result-visited: #a3be8c; /* DARK THEME COLORS */ --whoogle-dark-background: #222; --whoogle-dark-accent: #685e79; --whoogle-dark-text: #fff; --whoogle-dark-contrast-text: #000; --whoogle-dark-secondary-text: #bbb; --whoogle-dark-result-bg: #000; --whoogle-dark-result-title: #1967d2; --whoogle-dark-result-url: #4b11a8; --whoogle-dark-result-visited: #bbbbff; }", + "required": false + }, + "WHOOGLE_CONFIG_PREFERENCES_ENCRYPTED": { + "description": "[CONFIG] Encrypt preferences token, requires WHOOGLE_CONFIG_PREFERENCES_KEY to be set", + "value": "", + "required": false + }, + "WHOOGLE_CONFIG_PREFERENCES_KEY": { + "description": "[CONFIG] Key to encrypt preferences", + "value": "NEEDS_TO_BE_MODIFIED", + "required": false + } + } +} diff --git a/app/.DS_Store b/app/.DS_Store new file mode 100644 index 0000000..d96c6e6 Binary files 
/dev/null and b/app/.DS_Store differ diff --git a/app/__init__.py b/app/__init__.py new file mode 100755 index 0000000..5a10faf --- /dev/null +++ b/app/__init__.py @@ -0,0 +1,199 @@ +from app.filter import clean_query +from app.request import send_tor_signal +from app.utils.session import generate_key +from app.utils.bangs import gen_bangs_json, load_all_bangs +from app.utils.misc import gen_file_hash, read_config_bool +from base64 import b64encode +from bs4 import MarkupResemblesLocatorWarning +from datetime import datetime, timedelta +from dotenv import load_dotenv +from flask import Flask +import json +import logging.config +import os +from stem import Signal +import threading +import warnings + +from werkzeug.middleware.proxy_fix import ProxyFix + +from app.utils.misc import read_config_bool +from app.version import __version__ + +app = Flask(__name__, static_folder=os.path.dirname( + os.path.abspath(__file__)) + '/static') + +app.wsgi_app = ProxyFix(app.wsgi_app) + +# look for WHOOGLE_ENV, else look in parent directory +dot_env_path = os.getenv( + "WHOOGLE_DOTENV_PATH", + os.path.join(os.path.dirname(os.path.abspath(__file__)), "../whoogle.env")) + +# Load .env file if enabled +if os.path.exists(dot_env_path): + load_dotenv(dot_env_path) + +app.enc_key = generate_key() + +if read_config_bool('HTTPS_ONLY'): + app.config['SESSION_COOKIE_NAME'] = '__Secure-session' + app.config['SESSION_COOKIE_SECURE'] = True + +app.config['VERSION_NUMBER'] = __version__ +app.config['APP_ROOT'] = os.getenv( + 'APP_ROOT', + os.path.dirname(os.path.abspath(__file__))) +app.config['STATIC_FOLDER'] = os.getenv( + 'STATIC_FOLDER', + os.path.join(app.config['APP_ROOT'], 'static')) +app.config['BUILD_FOLDER'] = os.path.join( + app.config['STATIC_FOLDER'], 'build') +app.config['CACHE_BUSTING_MAP'] = {} +app.config['LANGUAGES'] = json.load(open( + os.path.join(app.config['STATIC_FOLDER'], 'settings/languages.json'), + encoding='utf-8')) +app.config['COUNTRIES'] = json.load(open( + 
os.path.join(app.config['STATIC_FOLDER'], 'settings/countries.json'), + encoding='utf-8')) +app.config['TIME_PERIODS'] = json.load(open( + os.path.join(app.config['STATIC_FOLDER'], 'settings/time_periods.json'), + encoding='utf-8')) +app.config['TRANSLATIONS'] = json.load(open( + os.path.join(app.config['STATIC_FOLDER'], 'settings/translations.json'), + encoding='utf-8')) +app.config['THEMES'] = json.load(open( + os.path.join(app.config['STATIC_FOLDER'], 'settings/themes.json'), + encoding='utf-8')) +app.config['HEADER_TABS'] = json.load(open( + os.path.join(app.config['STATIC_FOLDER'], 'settings/header_tabs.json'), + encoding='utf-8')) +app.config['CONFIG_PATH'] = os.getenv( + 'CONFIG_VOLUME', + os.path.join(app.config['STATIC_FOLDER'], 'config')) +app.config['DEFAULT_CONFIG'] = os.path.join( + app.config['CONFIG_PATH'], + 'config.json') +app.config['CONFIG_DISABLE'] = read_config_bool('WHOOGLE_CONFIG_DISABLE') +app.config['SESSION_FILE_DIR'] = os.path.join( + app.config['CONFIG_PATH'], + 'session') +app.config['MAX_SESSION_SIZE'] = 4000 # Sessions won't exceed 4KB +app.config['BANG_PATH'] = os.getenv( + 'CONFIG_VOLUME', + os.path.join(app.config['STATIC_FOLDER'], 'bangs')) +app.config['BANG_FILE'] = os.path.join( + app.config['BANG_PATH'], + 'bangs.json') + +# Ensure all necessary directories exist +if not os.path.exists(app.config['CONFIG_PATH']): + os.makedirs(app.config['CONFIG_PATH']) + +if not os.path.exists(app.config['SESSION_FILE_DIR']): + os.makedirs(app.config['SESSION_FILE_DIR']) + +if not os.path.exists(app.config['BANG_PATH']): + os.makedirs(app.config['BANG_PATH']) + +if not os.path.exists(app.config['BUILD_FOLDER']): + os.makedirs(app.config['BUILD_FOLDER']) + +# Session values +app_key_path = os.path.join(app.config['CONFIG_PATH'], 'whoogle.key') +if os.path.exists(app_key_path): + try: + app.config['SECRET_KEY'] = open(app_key_path, 'r').read() + except PermissionError: + app.config['SECRET_KEY'] = str(b64encode(os.urandom(32))) +else: + 
app.config['SECRET_KEY'] = str(b64encode(os.urandom(32)))
+    with open(app_key_path, 'w') as key_file:
+        key_file.write(app.config['SECRET_KEY'])
+app.config['PERMANENT_SESSION_LIFETIME'] = timedelta(days=365)
+
+# NOTE: SESSION_COOKIE_SAMESITE must be set to 'lax' to allow the user's
+# previous session to persist when accessing the instance from an external
+# link. Setting this value to 'strict' causes Whoogle to revalidate a new
+# session, and fail, resulting in cookies being disabled.
+app.config['SESSION_COOKIE_SAMESITE'] = 'Lax'
+
+# Config fields that are used to check for updates
+app.config['RELEASES_URL'] = 'https://github.com/' \
+                             'benbusby/whoogle-search/releases'
+app.config['LAST_UPDATE_CHECK'] = datetime.now() - timedelta(hours=24)
+app.config['HAS_UPDATE'] = ''
+
+# The alternative to Google Translate is treated a bit differently than other
+# social media site alternatives, in that it is used for any translation
+# related searches.
+translate_url = os.getenv('WHOOGLE_ALT_TL', 'https://farside.link/lingva')
+if not translate_url.startswith('http'):
+    translate_url = 'https://' + translate_url
+app.config['TRANSLATE_URL'] = translate_url
+
+app.config['CSP'] = 'default-src \'none\';' \
+                    'frame-src ' + translate_url + ';' \
+                    'manifest-src \'self\';' \
+                    'img-src \'self\' data:;' \
+                    'style-src \'self\' \'unsafe-inline\';' \
+                    'script-src \'self\';' \
+                    'media-src \'self\';' \
+                    'connect-src \'self\';'
+
+# Generate DDG bang filter
+generating_bangs = False
+if not os.path.exists(app.config['BANG_FILE']):
+    generating_bangs = True
+    with open(app.config['BANG_FILE'], 'w') as bang_file:
+        json.dump({}, bang_file)
+    bangs_thread = threading.Thread(
+        target=gen_bangs_json,
+        args=(app.config['BANG_FILE'],))
+    bangs_thread.start()
+
+# Build new mapping of static files for cache busting
+cache_busting_dirs = ['css', 'js']
+for cb_dir in cache_busting_dirs:
+    full_cb_dir = os.path.join(app.config['STATIC_FOLDER'], cb_dir)
+    for cb_file in
os.listdir(full_cb_dir): + # Create hash from current file state + full_cb_path = os.path.join(full_cb_dir, cb_file) + cb_file_link = gen_file_hash(full_cb_dir, cb_file) + build_path = os.path.join(app.config['BUILD_FOLDER'], cb_file_link) + + try: + os.symlink(full_cb_path, build_path) + except FileExistsError: + # Symlink hasn't changed, ignore + pass + + # Create mapping for relative path urls + map_path = build_path.replace(app.config['APP_ROOT'], '') + if map_path.startswith('/'): + map_path = map_path[1:] + app.config['CACHE_BUSTING_MAP'][cb_file] = map_path + +# Templating functions +app.jinja_env.globals.update(clean_query=clean_query) +app.jinja_env.globals.update( + cb_url=lambda f: app.config['CACHE_BUSTING_MAP'][f.lower()]) + +# Attempt to acquire tor identity, to determine if Tor config is available +send_tor_signal(Signal.HEARTBEAT) + +# Suppress spurious warnings from BeautifulSoup +warnings.simplefilter('ignore', MarkupResemblesLocatorWarning) + +from app import routes # noqa + +# The gen_bangs_json function takes care of loading bangs, so skip it here if +# it's already being loaded +if not generating_bangs: + load_all_bangs(app.config['BANG_FILE']) + +# Disable logging from imported modules +logging.config.dictConfig({ + 'version': 1, + 'disable_existing_loggers': True, +}) diff --git a/app/__main__.py b/app/__main__.py new file mode 100755 index 0000000..03a424c --- /dev/null +++ b/app/__main__.py @@ -0,0 +1,3 @@ +from .routes import run_app + +run_app() diff --git a/app/filter.py b/app/filter.py new file mode 100755 index 0000000..68be403 --- /dev/null +++ b/app/filter.py @@ -0,0 +1,785 @@ +import cssutils +from bs4 import BeautifulSoup +from bs4.element import ResultSet, Tag +from cryptography.fernet import Fernet +from flask import render_template +import html +import urllib.parse as urlparse +from urllib.parse import parse_qs +import re + +from app.models.g_classes import GClasses +from app.request import VALID_PARAMS, MAPS_URL +from 
app.utils.misc import get_abs_url, read_config_bool
+from app.utils.results import (
+    BLANK_B64, GOOG_IMG, GOOG_STATIC, G_M_LOGO_URL, LOGO_URL, SITE_ALTS,
+    has_ad_content, filter_link_args, append_anon_view, get_site_alt,
+)
+from app.models.endpoint import Endpoint
+from app.models.config import Config
+
+
+MAPS_ARGS = ['q', 'daddr']
+
+minimal_mode_sections = ['Top stories', 'Images', 'People also ask']
+unsupported_g_pages = [
+    'google.com/aclk',
+    '*.googleapis.com',
+    '*.gstatic.com',
+    '*.google-analytics.com',
+    'adservice.google.com',
+    'support.google.com',
+    'accounts.google.com',
+    'policies.google.com',
+    'google.com/preferences',
+    'google.com/intl',
+    'advanced_search',
+    'tbm=shop',
+    'ageverification.google.co.kr'
+]
+
+unsupported_g_divs = [
+    'google.com/preferences?hl=',
+    'ageverification.google.co.kr',
+    'google.com/aclk?sa='
+]
+
+
+def extract_q(q_str: str, href: str) -> str:
+    """Extracts the 'q' element from a result link. This is typically
+    either the link to a result's website, or a string.
+
+    Args:
+        q_str: The result link to parse
+        href: The full url to check for standalone 'q' elements first,
+              rather than parsing the whole query string and then checking.
+
+    Returns:
+        str: The 'q' element of the link, or an empty string
+    """
+    return parse_qs(q_str, keep_blank_values=True)['q'][0] if ('&q=' in href or '?q=' in href) else ''
+
+
+def build_map_url(href: str) -> str:
+    """Tries to extract known args that explain the location in the url. If a
+    location is found, returns the default url with it. Otherwise, returns the
+    url unchanged.
+
+    Args:
+        href: The full url to check.
+
+    Returns:
+        str: The parsed url, or the url unchanged.
+ """ + # parse the url + parsed_url = parse_qs(href) + # iterate through the known parameters and try build the url + for param in MAPS_ARGS: + if param in parsed_url: + return MAPS_URL + "?q=" + parsed_url[param][0] + + # query could not be extracted returning unchanged url + return href + + +def clean_query(query: str) -> str: + """Strips the blocked site list from the query, if one is being + used. + + Args: + query: The query string + + Returns: + str: The query string without any "-site:..." filters + """ + return query[:query.find('-site:')] if '-site:' in query else query + + +def clean_css(css: str, page_url: str) -> str: + """Removes all remote URLs from a CSS string. + + Args: + css: The CSS string + + Returns: + str: The filtered CSS, with URLs proxied through Whoogle + """ + sheet = cssutils.parseString(css) + urls = cssutils.getUrls(sheet) + + for url in urls: + abs_url = get_abs_url(url, page_url) + if abs_url.startswith('data:'): + continue + css = css.replace( + url, + f'{Endpoint.element}?type=image/png&url={abs_url}' + ) + + return css + + +class Filter: + # Limit used for determining if a result is a "regular" result or a list + # type result (such as "people also asked", "related searches", etc) + RESULT_CHILD_LIMIT = 7 + + def __init__( + self, + user_key: str, + config: Config, + root_url='', + page_url='', + query='', + mobile=False) -> None: + self.soup = None + self.config = config + self.mobile = mobile + self.user_key = user_key + self.page_url = page_url + self.query = query + self.main_divs = ResultSet('') + self._elements = 0 + self._av = set() + + self.root_url = root_url[:-1] if root_url.endswith('/') else root_url + + def __getitem__(self, name): + return getattr(self, name) + + @property + def elements(self): + return self._elements + + def encrypt_path(self, path, is_element=False) -> str: + # Encrypts path to avoid plaintext results in logs + if is_element: + # Element paths are encrypted separately from text, to allow key + # 
regeneration once all items have been served to the user + enc_path = Fernet(self.user_key).encrypt(path.encode()).decode() + self._elements += 1 + return enc_path + + return Fernet(self.user_key).encrypt(path.encode()).decode() + + def clean(self, soup) -> BeautifulSoup: + self.soup = soup + self.main_divs = self.soup.find('div', {'id': 'main'}) + self.remove_ads() + self.remove_block_titles() + self.remove_block_url() + self.collapse_sections() + self.update_css() + self.update_styling() + self.remove_block_tabs() + + # self.main_divs is only populated for the main page of search results + # (i.e. not images/news/etc). + if self.main_divs: + for div in self.main_divs: + self.sanitize_div(div) + + for img in [_ for _ in self.soup.find_all('img') if 'src' in _.attrs]: + self.update_element_src(img, 'image/png') + + for audio in [_ for _ in self.soup.find_all('audio') if 'src' in _.attrs]: + self.update_element_src(audio, 'audio/mpeg') + audio['controls'] = '' + + for link in self.soup.find_all('a', href=True): + self.update_link(link) + self.add_favicon(link) + + if self.config.alts: + self.site_alt_swap() + + input_form = self.soup.find('form') + if input_form is not None: + input_form['method'] = 'GET' if self.config.get_only else 'POST' + # Use a relative URI for submissions + input_form['action'] = 'search' + + # Ensure no extra scripts passed through + for script in self.soup('script'): + script.decompose() + + # Update default footer and header + footer = self.soup.find('footer') + if footer: + # Remove divs that have multiple links beyond just page navigation + [_.decompose() for _ in footer.find_all('div', recursive=False) + if len(_.find_all('a', href=True)) > 3] + for link in footer.find_all('a', href=True): + link['href'] = f'{link["href"]}&preferences={self.config.preferences}' + + header = self.soup.find('header') + if header: + header.decompose() + self.remove_site_blocks(self.soup) + return self.soup + + def sanitize_div(self, div) -> None: + 
"""Removes escaped script and iframe tags from results + + Returns: + None (The soup object is modified directly) + """ + if not div: + return + + for d in div.find_all('div', recursive=True): + d_text = d.find(text=True, recursive=False) + + # Ensure we're working with tags that contain text content + if not d_text or not d.string: + continue + + d.string = html.unescape(d_text) + div_soup = BeautifulSoup(d.string, 'html.parser') + + # Remove all valid script or iframe tags in the div + for script in div_soup.find_all('script'): + script.decompose() + + for iframe in div_soup.find_all('iframe'): + iframe.decompose() + + d.string = str(div_soup) + + def add_favicon(self, link) -> None: + """Adds icons for each returned result, using the result site's favicon + + Returns: + None (The soup object is modified directly) + """ + # Skip empty, parentless, or internal links + show_favicons = read_config_bool('WHOOGLE_SHOW_FAVICONS', True) + is_valid_link = link and link.parent and link['href'].startswith('http') + if not show_favicons or not is_valid_link: + return + + parent = link.parent + is_result_div = False + + # Check each parent to make sure that the div doesn't already have a + # favicon attached, and that the div is a result div + while parent: + p_cls = parent.attrs.get('class') or [] + if 'has-favicon' in p_cls or GClasses.scroller_class in p_cls: + return + elif GClasses.result_class_a not in p_cls: + parent = parent.parent + else: + is_result_div = True + break + + if not is_result_div: + return + + # Construct the html for inserting the icon into the parent div + parsed = urlparse.urlparse(link['href']) + favicon = self.encrypt_path( + f'{parsed.scheme}://{parsed.netloc}/favicon.ico', + is_element=True) + src = f'{self.root_url}/{Endpoint.element}?url={favicon}' + \ + '&type=image/x-icon' + html = f'' + + favicon = BeautifulSoup(html, 'html.parser') + link.parent.insert(0, favicon) + + # Update all parents to indicate that a favicon has been attached + 
parent = link.parent + while parent: + p_cls = parent.get('class') or [] + p_cls.append('has-favicon') + parent['class'] = p_cls + parent = parent.parent + + if GClasses.result_class_a in p_cls: + break + + def remove_site_blocks(self, soup) -> None: + if not self.config.block or not soup.body: + return + search_string = ' '.join(['-site:' + + _ for _ in self.config.block.split(',')]) + selected = soup.body.findAll(text=re.compile(search_string)) + + for result in selected: + result.string.replace_with(result.string.replace( + search_string, '')) + + def remove_ads(self) -> None: + """Removes ads found in the list of search result divs + + Returns: + None (The soup object is modified directly) + """ + if not self.main_divs: + return + + for div in [_ for _ in self.main_divs.find_all('div', recursive=True)]: + div_ads = [_ for _ in div.find_all('span', recursive=True) + if has_ad_content(_.text)] + _ = div.decompose() if len(div_ads) else None + + def remove_block_titles(self) -> None: + if not self.main_divs or not self.config.block_title: + return + block_title = re.compile(self.config.block_title) + for div in [_ for _ in self.main_divs.find_all('div', recursive=True)]: + block_divs = [_ for _ in div.find_all('h3', recursive=True) + if block_title.search(_.text) is not None] + _ = div.decompose() if len(block_divs) else None + + def remove_block_url(self) -> None: + if not self.main_divs or not self.config.block_url: + return + block_url = re.compile(self.config.block_url) + for div in [_ for _ in self.main_divs.find_all('div', recursive=True)]: + block_divs = [_ for _ in div.find_all('a', recursive=True) + if block_url.search(_.attrs['href']) is not None] + _ = div.decompose() if len(block_divs) else None + + def remove_block_tabs(self) -> None: + if self.main_divs: + for div in self.main_divs.find_all( + 'div', + attrs={'class': f'{GClasses.main_tbm_tab}'} + ): + _ = div.decompose() + else: + # when in images tab + for div in self.soup.find_all( + 'div', + 
attrs={'class': f'{GClasses.images_tbm_tab}'} + ): + _ = div.decompose() + + def collapse_sections(self) -> None: + """Collapses long result sections ("people also asked", "related + searches", etc) into "details" elements + + These sections are typically the only sections in the results page that + have more than ~5 child divs within a primary result div. + + Returns: + None (The soup object is modified directly) + """ + minimal_mode = read_config_bool('WHOOGLE_MINIMAL') + + def pull_child_divs(result_div: BeautifulSoup): + try: + return result_div.findChildren( + 'div', recursive=False + )[0].findChildren( + 'div', recursive=False) + except IndexError: + return [] + + if not self.main_divs: + return + #töörölni kell People also ask, + search_terms = ["People also search for", "Related searches", "Kapcsolódó keresések", "Mások ezeket keresték még"] + details_list = [] + + # Loop through results and check for the number of child divs in each + for result in self.main_divs.find_all(): + result_children = pull_child_divs(result) + if minimal_mode: + if any(f">{x} 1: + subtitle = ' (' + \ + ''.join(content[1:]) + ')' + elem.decompose() + break + + # Determine the class based on the label content + if any(term in label for term in search_terms): + details_class = 'search-recommendations' + details_attrs = {'class': details_class, 'open': 'true'} + else: + details_class = 'other-results' + details_attrs = {'class': details_class} + + + # Create the new details element to wrap around the result's + # first parent + parent = None + idx = 0 + while not parent and idx < len(result_children): + parent = result_children[idx].parent + idx += 1 + + details = BeautifulSoup(features='html.parser').new_tag('details', attrs=details_attrs) + summary = BeautifulSoup(features='html.parser').new_tag('summary', attrs={'class': "summary_div"}) + summary.string = label + + if subtitle: + soup = BeautifulSoup(subtitle, 'html.parser') + summary.append(soup) + + details.append(summary) + + 
if parent and not minimal_mode: + parent.wrap(details) + elif parent and minimal_mode: + # Remove parent element from document if "minimal mode" is + # enabled + parent.decompose() + + for details in details_list: + self.main_divs.append(details) + + def update_element_src(self, element: Tag, mime: str, attr='src') -> None: + """Encrypts the original src of an element and rewrites the element src + to use the "/element?src=" pass-through. + + Returns: + None (The soup element is modified directly) + + """ + src = element[attr].split(' ')[0] + + if src.startswith('//'): + src = 'https:' + src + elif src.startswith('data:'): + return + + if src.startswith(LOGO_URL): + # Re-brand with Whoogle logo + element.replace_with(BeautifulSoup( + render_template('logo.html'), + features='html.parser')) + return + elif src.startswith(G_M_LOGO_URL): + # Re-brand with single-letter Whoogle logo + element['src'] = 'static/img/favicon/apple-icon.png' + element.parent['href'] = 'home' + return + elif src.startswith(GOOG_IMG) or GOOG_STATIC in src: + element['src'] = BLANK_B64 + return + + element[attr] = f'{self.root_url}/{Endpoint.element}?url=' + ( + self.encrypt_path( + src, + is_element=True + ) + '&type=' + urlparse.quote(mime) + ) + + def update_css(self) -> None: + """Updates URLs used in inline styles to be proxied by Whoogle + using the /element endpoint. + + Returns: + None (The soup element is modified directly) + + """ + # Filter all +

+
+

0

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ diff --git a/app/templates/display.html b/app/templates/display.html new file mode 100755 index 0000000..a09a58e --- /dev/null +++ b/app/templates/display.html @@ -0,0 +1,40 @@ + + + + + {% if not search_type %} + + {% else %} + + {% endif %} + + + + + + + + + + {{ clean_query(query) }} - RaveSearch + + +{{ search_header|safe }} +{% if is_translation %} + +{% endif %} +{{ response|safe }} + +{% include 'footer.html' %} +{% if autocomplete_enabled == '1' %} + +{% endif %} + + + + diff --git a/app/templates/error.html b/app/templates/error.html new file mode 100755 index 0000000..dcaf3cf --- /dev/null +++ b/app/templates/error.html @@ -0,0 +1,106 @@ +{% if config.theme %} + {% if config.theme == 'system' %} + + {% else %} + + {% endif %} +{% else %} + +{% endif %} + + + +
+

Error

+

+ {{ error_message }} +

+
+ {% if query and translation %} +

+

{{ translation['continue-search'] }}

+ +
+

Other options:

+ +
+

+ {% endif %} + Return Home +
diff --git a/app/templates/footer.html b/app/templates/footer.html new file mode 100755 index 0000000..821a67e --- /dev/null +++ b/app/templates/footer.html @@ -0,0 +1,18 @@ + + + diff --git a/app/templates/header.html b/app/templates/header.html new file mode 100755 index 0000000..b488b39 --- /dev/null +++ b/app/templates/header.html @@ -0,0 +1,89 @@ + +
+ + + + +
+
+ {% for tab_id, tab_content in tabs.items() %} + {% if tab_content['selected'] %} + + {% else %} + + {% endif %} + {% endfor %} + + +
+
+
+
+ +
+
+ + +

+ + +
+
+ + diff --git a/app/templates/header2.html b/app/templates/header2.html new file mode 100755 index 0000000..959adc0 --- /dev/null +++ b/app/templates/header2.html @@ -0,0 +1,210 @@ +{% if mobile %} +
+
+
+ +
+
+ {% if config.preferences %} + + {% endif %} + + + + + +
+
+
+
+
+
+
+
+
+
+ {% for tab_id, tab_content in tabs.items() %} + {% if tab_content['selected'] %} + {{ tab_content['name'] }} + {% else %} + {{ tab_content['name'] }} + {% endif %} + {% endfor %} + + +
+
+
+
+
+
+
+
+{% else %} +
+ +
+
+
+
+ {% if config.preferences %} + + {% endif %} + + + + + +
+
+
+
+
+
+
+
+
+
+
+ {% for tab_id, tab_content in tabs.items() %} + {% if tab_content['selected'] %} + {{ tab_content['name'] }} + {% else %} + {{ tab_content['name'] }} + {% endif %} + {% endfor %} + + +
+
+
+
+
+
+{% endif %} +
+
+ + +
+ + +
+
+ + + + + + + + + +
+ +
+ {% set display_tooltip = true %} + {% include 'simple/categories.html' %} +
+ +
+ {% include 'simple/filters/languages.html' %} + {% include 'simple/filters/time_range.html' %} + {% include 'simple/filters/safesearch.html' %} +
+ + {% if timeout_limit %}{% endif %} + + + + +
{{- '' -}} +
+ {%- if not search_on_category_select or not display_tooltip -%} + {%- for category in categories_as_tabs -%} +
{{- '' -}} + + +
+ {%- endfor -%} + {%- if display_tooltip %}
{{ _('Click on the magnifier to perform search') }}
{% endif -%} + {%- else -%} + {%- for category in categories_as_tabs -%}{{- '\n' -}} + {{- '' -}} + {%- endfor -%} + {{- '\n' -}} + {%- endif -%} +
{{- '' -}} +
diff --git a/app/templates/imageresults.html b/app/templates/imageresults.html new file mode 100755 index 0000000..2df3a76 --- /dev/null +++ b/app/templates/imageresults.html @@ -0,0 +1,451 @@ +
+ + +
+ {% for result in results %} + + {% endfor %} +
+ +
+ +
+
+ + +
+
+ +
+
+ +
\ No newline at end of file diff --git a/app/templates/index.html b/app/templates/index.html new file mode 100755 index 0000000..31d0ccc --- /dev/null +++ b/app/templates/index.html @@ -0,0 +1,303 @@ + + + + + + + + + + + + + + {% if autocomplete_enabled == '1' %} + + {% endif %} + + + + + + + + + + RaveSearch + + +
+
+ {{ logo|safe }} +
+
+
+
+ {% if config.preferences %} + + {% endif %} + +
+ +
+ +
+ + {% if not config_disabled %} + +
+
+
+
+ + +
+ + +
+
+ + +
+ + + + + +
+ + +
+
+ + +
+ + + + + +
+ + +
+
+ + +
— {{ translation['config-alts-help'] }}
+
+
+ + +
+ +
+ + +
+
+ + +
+ +
+ + +
+
+ + {{ translation['config-css'] }}: + + +
+
+ + +
— {{ translation['config-pref-help'] }}
+ + +
+
+
+   +   + +
+
+
+
+ {% endif %} + + + +
+{% include 'footer.html' %} + + + diff --git a/app/templates/logo.html b/app/templates/logo.html new file mode 100755 index 0000000..af64c22 --- /dev/null +++ b/app/templates/logo.html @@ -0,0 +1,8 @@ + diff --git a/app/templates/opensearch.xml b/app/templates/opensearch.xml new file mode 100755 index 0000000..856a6db --- /dev/null +++ b/app/templates/opensearch.xml @@ -0,0 +1,25 @@ + + + {% if not search_type %} + RaveSearch + {% else %} + RaveSearch {{ search_name }} + {% endif %} + RaveSearch: A self-hosted, ad-free, privacy-respecting metasearch engine + UTF-8 + +  + + + + {% if search_type %} + + {% endif %} + + + + + {{ main_url }}/search + + diff --git a/app/templates/search.html b/app/templates/search.html new file mode 100755 index 0000000..634b707 --- /dev/null +++ b/app/templates/search.html @@ -0,0 +1,15 @@ +
+ + +
diff --git a/app/utils/.DS_Store b/app/utils/.DS_Store new file mode 100755 index 0000000..8942dfb Binary files /dev/null and b/app/utils/.DS_Store differ diff --git a/app/utils/__init__.py b/app/utils/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/app/utils/bangs.py b/app/utils/bangs.py new file mode 100755 index 0000000..4e7a82f --- /dev/null +++ b/app/utils/bangs.py @@ -0,0 +1,146 @@ +import json +import requests +import urllib.parse as urlparse +import os +import glob + +bangs_dict = {} +DDG_BANGS = 'https://duckduckgo.com/bang.js' + + +def load_all_bangs(ddg_bangs_file: str, ddg_bangs: dict = {}): + """Loads all the bang files in alphabetical order + Args: + ddg_bangs_file: The str path to the new DDG bangs json file + ddg_bangs: The dict of ddg bangs. If this is empty, it will load the + bangs from the file + Returns: + None + """ + global bangs_dict + ddg_bangs_file = os.path.normpath(ddg_bangs_file) + + if (bangs_dict and not ddg_bangs) or os.path.getsize(ddg_bangs_file) <= 4: + return + + bangs = {} + bangs_dir = os.path.dirname(ddg_bangs_file) + bang_files = glob.glob(os.path.join(bangs_dir, '*.json')) + + # Normalize the paths + bang_files = [os.path.normpath(f) for f in bang_files] + + # Move the ddg bangs file to the beginning + bang_files = sorted([f for f in bang_files if f != ddg_bangs_file]) + + if ddg_bangs: + bangs |= ddg_bangs + else: + bang_files.insert(0, ddg_bangs_file) + + for i, bang_file in enumerate(bang_files): + try: + bangs |= json.load(open(bang_file)) + except json.decoder.JSONDecodeError: + # Ignore decoding error only for the ddg bangs file, since this can + # occur if file is still being written + if i != 0: + raise + + bangs_dict = dict(sorted(bangs.items())) + + +def gen_bangs_json(bangs_file: str) -> None: + """Generates a json file from the DDG bangs list + + Args: + bangs_file: The str path to the new DDG bangs json file + + Returns: + None + + """ + try: + # Request full list from DDG + r = 
requests.get(DDG_BANGS) + r.raise_for_status() + except requests.exceptions.HTTPError as err: + raise SystemExit(err) + + # Convert to json + data = json.loads(r.text) + + # Set up a json object (with better formatting) for all available bangs + bangs_data = {} + + for row in data: + bang_command = '!' + row['t'] + bangs_data[bang_command] = { + 'url': row['u'].replace('{{{s}}}', '{}'), + 'suggestion': bang_command + ' (' + row['s'] + ')' + } + + json.dump(bangs_data, open(bangs_file, 'w')) + print('* Finished creating ddg bangs json') + load_all_bangs(bangs_file, bangs_data) + + +def suggest_bang(query: str) -> list[str]: + """Suggests bangs for a user's query + Args: + query: The search query + Returns: + list[str]: A list of bang suggestions + """ + global bangs_dict + return [bangs_dict[_]['suggestion'] for _ in bangs_dict if _.startswith(query)] + + +def resolve_bang(query: str) -> str: + """Transform's a user's query to a bang search, if an operator is found + + Args: + query: The search query + + Returns: + str: A formatted redirect for a bang search, or an empty str if there + wasn't a match or didn't contain a bang operator + + """ + + global bangs_dict + + #if ! not in query simply return (speed up processing) + if '!' 
not in query: + return '' + + split_query = query.strip().split(' ') + + # look for operator in query if one is found, list operator should be of + # length 1, operator should not be case-sensitive here to remove it later + operator = [ + word + for word in split_query + if word.lower() in bangs_dict + ] + if len(operator) == 1: + # get operator + operator = operator[0] + + # removes operator from query + split_query.remove(operator) + + # rebuild the query string + bang_query = ' '.join(split_query).strip() + + # Check if operator is a key in bangs and get bang if exists + bang = bangs_dict.get(operator.lower(), None) + if bang: + bang_url = bang['url'] + + if bang_query: + return bang_url.replace('{}', bang_query, 1) + else: + parsed_url = urlparse.urlparse(bang_url) + return f'{parsed_url.scheme}://{parsed_url.netloc}' + return '' diff --git a/app/utils/misc.py b/app/utils/misc.py new file mode 100755 index 0000000..ee6d62e --- /dev/null +++ b/app/utils/misc.py @@ -0,0 +1,137 @@ +import base64 +import hashlib +import contextlib +import io +import os +import re + +from requests import exceptions, get +from urllib.parse import urlparse +from bs4 import BeautifulSoup as bsoup +from cryptography.fernet import Fernet +from flask import Request + +ddg_favicon_site = 'http://icons.duckduckgo.com/ip2' + +empty_gif = base64.b64decode( + 'R0lGODlhAQABAIAAAP///////yH5BAEKAAEALAAAAAABAAEAAAICTAEAOw==') + +placeholder_img = base64.b64decode( + 'iVBORw0KGgoAAAANSUhEUgAAABkAAAAZCAYAAADE6YVjAAABF0lEQVRIS8XWPw9EMBQA8Eok' \ + 'JBKrMFqMBt//GzAYLTZ/VomExPDu6uLiaPteqVynBn0/75W2Vp7nEIYhe6p1XcespmmAd7Is' \ + 'M+4URcGiKPogvMMvmIS2eN9MOMKbKWgf54SYgI4vKkTuQKJKSJErkKzUSkQHUs0lilAg7GMh' \ + 'ISoIA/hYMiKCKIA2soeowCWEMkfHtUmrXLcyGYYBfN9HF8djiaglWzNZlgVs21YisoAUaEXG' \ + 'cQTP86QIFgi7vyLzPIPjOEIEC7ANQv/4aZrAdd0TUtc1i+MYnSsMWjPp+x6CIPgJVlUVS5KE' \ + 'DKig/+wnVzM4pnzaGeHd+ENlWbI0TbVLJBtw2uMfP63wc9d2kDCWxi5Q27bsBerSJ9afJbeL' \ + 'AAAAAElFTkSuQmCC' +) + + +def fetch_favicon(url: str) -> bytes: + 
"""Fetches a favicon using DuckDuckGo's favicon retriever + + Args: + url: The url to fetch the favicon from + Returns: + bytes - the favicon bytes, or a placeholder image if one + was not returned + """ + response = get(f'{ddg_favicon_site}/{urlparse(url).netloc}.ico') + + if response.status_code == 200 and len(response.content) > 0: + tmp_mem = io.BytesIO() + tmp_mem.write(response.content) + tmp_mem.seek(0) + + return tmp_mem.read() + return placeholder_img + + +def gen_file_hash(path: str, static_file: str) -> str: + file_contents = open(os.path.join(path, static_file), 'rb').read() + file_hash = hashlib.md5(file_contents).hexdigest()[:8] + filename_split = os.path.splitext(static_file) + + return f'{filename_split[0]}.{file_hash}{filename_split[-1]}' + + +def read_config_bool(var: str, default: bool=False) -> bool: + val = os.getenv(var, '1' if default else '0') + # user can specify one of the following values as 'true' inputs (all + # variants with upper case letters will also work): + # ('true', 't', '1', 'yes', 'y') + return val.lower() in ('true', 't', '1', 'yes', 'y') + + +def get_client_ip(r: Request) -> str: + if r.environ.get('HTTP_X_FORWARDED_FOR') is None: + return r.environ['REMOTE_ADDR'] + return r.environ['HTTP_X_FORWARDED_FOR'] + + +def get_request_url(url: str) -> str: + if os.getenv('HTTPS_ONLY', False): + return url.replace('http://', 'https://', 1) + + return url + + +def get_proxy_host_url(r: Request, default: str, root=False) -> str: + scheme = r.headers.get('X-Forwarded-Proto', 'https') + http_host = r.headers.get('X-Forwarded-Host') + + full_path = r.full_path if not root else '' + if full_path.startswith('/'): + full_path = f'/{full_path}' + + if http_host: + prefix = os.environ.get('WHOOGLE_URL_PREFIX', '') + if prefix: + prefix = f'/{re.sub("[^0-9a-zA-Z]+", "", prefix)}' + return f'{scheme}://{http_host}{prefix}{full_path}' + + return default + + +def check_for_update(version_url: str, current: str) -> int: + # Check for the latest 
version of Whoogle + has_update = '' + with contextlib.suppress(exceptions.ConnectionError, AttributeError): + update = bsoup(get(version_url).text, 'html.parser') + latest = update.select_one('[class="Link--primary"]').string[1:] + current = int(''.join(filter(str.isdigit, current))) + latest = int(''.join(filter(str.isdigit, latest))) + has_update = '' if current >= latest else latest + + return has_update + + +def get_abs_url(url, page_url): + # Creates a valid absolute URL using a partial or relative URL + urls = { + "//": f"https:{url}", + "/": f"{urlparse(page_url).netloc}{url}", + "./": f"{page_url}{url[2:]}" + } + for start in urls: + if url.startswith(start): + return urls[start] + + return url + + +def list_to_dict(lst: list) -> dict: + if len(lst) < 2: + return {} + return {lst[i].replace(' ', ''): lst[i+1].replace(' ', '') + for i in range(0, len(lst), 2)} + + +def encrypt_string(key: bytes, string: str) -> str: + cipher_suite = Fernet(key) + return cipher_suite.encrypt(string.encode()).decode() + + +def decrypt_string(key: bytes, string: str) -> str: + cipher_suite = Fernet(g.session_key) + return cipher_suite.decrypt(string.encode()).decode() diff --git a/app/utils/results.py b/app/utils/results.py new file mode 100755 index 0000000..c87defd --- /dev/null +++ b/app/utils/results.py @@ -0,0 +1,466 @@ +from app.models.config import Config +from app.models.endpoint import Endpoint +from app.utils.misc import list_to_dict +from bs4 import BeautifulSoup, NavigableString +import copy +from flask import current_app +import html +import os +import urllib.parse as urlparse +from urllib.parse import parse_qs +import re +import warnings + +SKIP_ARGS = ['ref_src', 'utm'] +SKIP_PREFIX = ['//www.', '//mobile.', '//m.'] +GOOG_STATIC = 'www.gstatic.com' +G_M_LOGO_URL = 'https://www.gstatic.com/m/images/icons/googleg.gif' +GOOG_IMG = '/images/branding/searchlogo/1x/googlelogo' +LOGO_URL = GOOG_IMG + '_desk' +BLANK_B64 = ('data:image/png;base64,' + 
'iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAQAAAAnOwc2AAAAD0lEQVR42mNkw' + 'AIYh7IgAAVVAAuInjI5AAAAAElFTkSuQmCC') + +# Ad keywords +BLACKLIST = [ + 'ad', 'ads', 'anuncio', 'annuncio', 'annonce', 'Anzeige', '广告', '廣告', + 'Reklama', 'Реклама', 'Anunț', '광고', 'annons', 'Annonse', 'Iklan', + '広告', 'Augl.', 'Mainos', 'Advertentie', 'إعلان', 'Գովազդ', 'विज्ञापन', + 'Reklam', 'آگهی', 'Reklāma', 'Reklaam', 'Διαφήμιση', 'מודעה', 'Hirdetés', 'Szponzorált', + 'Anúncio', 'Quảng cáo','โฆษณา', 'sponsored', 'patrocinado', 'gesponsert', 'Sponzorováno', '스폰서', 'Gesponsord' +] + +SITE_ALTS = { + 'twitter.com': os.getenv('WHOOGLE_ALT_TW', 'farside.link/nitter'), + 'youtube.com': os.getenv('WHOOGLE_ALT_YT', 'farside.link/invidious'), + 'reddit.com': os.getenv('WHOOGLE_ALT_RD', 'farside.link/libreddit'), + **dict.fromkeys([ + 'medium.com', + 'levelup.gitconnected.com' + ], os.getenv('WHOOGLE_ALT_MD', 'farside.link/scribe')), + 'imgur.com': os.getenv('WHOOGLE_ALT_IMG', 'farside.link/rimgo'), + 'wikipedia.org': os.getenv('WHOOGLE_ALT_WIKI', 'farside.link/wikiless'), + 'imdb.com': os.getenv('WHOOGLE_ALT_IMDB', 'farside.link/libremdb'), + 'quora.com': os.getenv('WHOOGLE_ALT_QUORA', 'farside.link/quetre'), + 'stackoverflow.com': os.getenv('WHOOGLE_ALT_SO', 'farside.link/anonymousoverflow') +} + +# Include custom site redirects from WHOOGLE_REDIRECTS +SITE_ALTS.update(list_to_dict(re.split(',|:', os.getenv('WHOOGLE_REDIRECTS', '')))) + + +def contains_cjko(s: str) -> bool: + """This function check whether or not a string contains Chinese, Japanese, + or Korean characters. It employs regex and uses the u escape sequence to + match any character in a set of Unicode ranges. 
+ + Args: + s (str): string to be checked + + Returns: + bool: True if the input s contains the characters and False otherwise + """ + unicode_ranges = ('\u4e00-\u9fff' # Chinese characters + '\u3040-\u309f' # Japanese hiragana + '\u30a0-\u30ff' # Japanese katakana + '\u4e00-\u9faf' # Japanese kanji + '\uac00-\ud7af' # Korean hangul syllables + '\u1100-\u11ff' # Korean hangul jamo + ) + return bool(re.search(fr'[{unicode_ranges}]', s)) + + +def bold_search_terms(response: str, query: str) -> BeautifulSoup: + """Wraps all search terms in bold tags (). If any terms are wrapped + in quotes, only that exact phrase will be made bold. + + Args: + response: The initial response body for the query + query: The original search query + + Returns: + BeautifulSoup: modified soup object with bold items + """ + response = BeautifulSoup(response, 'html.parser') + + def replace_any_case(element: NavigableString, target_word: str) -> None: + # Replace all instances of the word, but maintaining the same case in + # the replacement + if len(element) == len(target_word): + return + + # Ensure target word is escaped for regex + target_word = re.escape(target_word) + + # Check if the word contains Chinese, Japanese, or Korean characters + if contains_cjko(target_word): + reg_pattern = fr'((?![{{}}<>-]){target_word}(?![{{}}<>-]))' + else: + reg_pattern = fr'\b((?![{{}}<>-]){target_word}(?![{{}}<>-]))\b' + + if re.match(r'.*[@_!#$%^&*()<>?/\|}{~:].*', target_word) or ( + element.parent and element.parent.name == 'style'): + return + + element.replace_with(BeautifulSoup( + re.sub(reg_pattern, + r'\1', + element, + flags=re.I), 'html.parser') + ) + + # Split all words out of query, grouping the ones wrapped in quotes + for word in re.split(r'\s+(?=[^"]*(?:"[^"]*"[^"]*)*$)', query): + word = re.sub(r'[@_!#$%^&*()<>?/\|}{~:]+', '', word) + target = response.find_all( + text=re.compile(r'' + re.escape(word), re.I)) + for nav_str in target: + replace_any_case(nav_str, word) + + return response 
+ + +def has_ad_content(element: str) -> bool: + """Inspects an HTML element for ad related content + + Args: + element: The HTML element to inspect + + Returns: + bool: True/False for the element containing an ad + + """ + element_str = ''.join(filter(str.isalpha, element)) + return (element_str.upper() in (value.upper() for value in BLACKLIST) + or 'ⓘ' in element) + + +def get_first_link(soup: BeautifulSoup) -> str: + """Retrieves the first result link from the query response + + Args: + soup: The BeautifulSoup response body + + Returns: + str: A str link to the first result + + """ + first_link = '' + orig_details = [] + + # Temporarily remove details so we don't grab those links + for details in soup.find_all('details'): + temp_details = soup.new_tag('removed_details') + orig_details.append(details.replace_with(temp_details)) + + # Replace hrefs with only the intended destination (no "utm" type tags) + for a in soup.find_all('a', href=True): + # Return the first search result URL + if a['href'].startswith('http://') or a['href'].startswith('https://'): + first_link = a['href'] + break + + # Add the details back + for orig_detail, details in zip(orig_details, soup.find_all('removed_details')): + details.replace_with(orig_detail) + + return first_link + + +def get_site_alt(link: str, site_alts: dict = SITE_ALTS) -> str: + """Returns an alternative to a particular site, if one is configured + + Args: + link: A string result URL to check against the site_alts map + site_alts: A map of site alternatives to replace with. defaults to SITE_ALTS + + Returns: + str: An updated (or ignored) result link + + """ + # Need to replace full hostname with alternative to encapsulate + # subdomains as well + parsed_link = urlparse.urlparse(link) + + # Extract subdomain separately from the domain+tld. The subdomain + # is used for wikiless translations. 
+ split_host = parsed_link.netloc.split('.') + subdomain = split_host[0] if len(split_host) > 2 else '' + hostname = '.'.join(split_host[-2:]) + + # The full scheme + hostname is used when comparing against the list of + # available alternative services, due to how Medium links are constructed. + # (i.e. for medium.com: "https://something.medium.com" should match, + # "https://medium.com/..." should match, but "philomedium.com" should not) + hostcomp = f'{parsed_link.scheme}://{hostname}' + + for site_key in site_alts.keys(): + site_alt = f'{parsed_link.scheme}://{site_key}' + if not hostname or site_alt not in hostcomp or not site_alts[site_key]: + continue + + # Wikipedia -> Wikiless replacements require the subdomain (if it's + # a 2-char language code) to be passed as a URL param to Wikiless + # in order to preserve the language setting. + params = '' + if 'wikipedia' in hostname and len(subdomain) == 2: + hostname = f'{subdomain}.{hostname}' + params = f'?lang={subdomain}' + elif 'medium' in hostname and len(subdomain) > 0: + hostname = f'{subdomain}.{hostname}' + + parsed_alt = urlparse.urlparse(site_alts[site_key]) + link = link.replace(hostname, site_alts[site_key]) + params + # If a scheme is specified in the alternative, this results in a + # replaced link that looks like "https://http://altservice.tld". + # In this case, we can remove the original scheme from the result + # and use the one specified for the alt. 
+ if parsed_alt.scheme: + link = '//'.join(link.split('//')[1:]) + + for prefix in SKIP_PREFIX: + if parsed_alt.scheme: + # If a scheme is specified, remove everything before the + # first occurence of it + link = f'{parsed_alt.scheme}{link.split(parsed_alt.scheme, 1)[-1]}' + else: + # Otherwise, replace the first occurrence of the prefix + link = link.replace(prefix, '//', 1) + break + + return link + + +def filter_link_args(link: str) -> str: + """Filters out unnecessary URL args from a result link + + Args: + link: The string result link to check for extraneous URL params + + Returns: + str: An updated (or ignored) result link + + """ + parsed_link = urlparse.urlparse(link) + link_args = parse_qs(parsed_link.query) + safe_args = {} + + if len(link_args) == 0 and len(parsed_link) > 0: + return link + + for arg in link_args.keys(): + if arg in SKIP_ARGS: + continue + + safe_args[arg] = link_args[arg] + + # Remove original link query and replace with filtered args + link = link.replace(parsed_link.query, '') + if len(safe_args) > 0: + link = link + urlparse.urlencode(safe_args, doseq=True) + else: + link = link.replace('?', '') + + return link + + +def append_nojs(result: BeautifulSoup) -> None: + """Appends a no-Javascript alternative for a search result + + Args: + result: The search result to append a no-JS link to + + Returns: + None + + """ + nojs_link = BeautifulSoup(features='html.parser').new_tag('a') + nojs_link['href'] = f'{Endpoint.window}?nojs=1&location=' + result['href'] + nojs_link.string = ' NoJS Link' + result.append(nojs_link) + + +def append_anon_view(result: BeautifulSoup, config: Config) -> None: + """Appends an 'anonymous view' for a search result, where all site + contents are viewed through Whoogle as a proxy. 
+ + Args: + result: The search result to append an anon view link to + nojs: Remove Javascript from Anonymous View + + Returns: + None + + """ + av_link = BeautifulSoup(features='html.parser').new_tag('a') + nojs = 'nojs=1' if config.nojs else 'nojs=0' + location = f'location={result["href"]}' + av_link['href'] = f'{Endpoint.window}?{nojs}&{location}' + translation = current_app.config['TRANSLATIONS'][ + config.get_localization_lang() + ] + av_link.string = f'{translation["anon-view"]}' + av_link['class'] = 'anon-view' + result.append(av_link) + +def check_currency(response: str) -> dict: + """Check whether the results have currency conversion + + Args: + response: Search query Result + + Returns: + dict: Consists of currency names and values + + """ + soup = BeautifulSoup(response, 'html.parser') + currency_link = soup.find('a', {'href': 'https://g.co/gfd'}) + if currency_link: + while 'class' not in currency_link.attrs or \ + 'ZINbbc' not in currency_link.attrs['class']: + if currency_link.parent: + currency_link = currency_link.parent + else: + return {} + currency_link = currency_link.find_all(class_='BNeawe') + currency1 = currency_link[0].text + currency2 = currency_link[1].text + currency1 = currency1.rstrip('=').split(' ', 1) + currency2 = currency2.split(' ', 1) + + # Handle differences in currency formatting + # i.e. 
"5.000" vs "5,000" + if currency2[0][-3] == ',': + currency1[0] = currency1[0].replace('.', '') + currency1[0] = currency1[0].replace(',', '.') + currency2[0] = currency2[0].replace('.', '') + currency2[0] = currency2[0].replace(',', '.') + else: + currency1[0] = currency1[0].replace(',', '') + currency2[0] = currency2[0].replace(',', '') + + currency1_value = float(re.sub(r'[^\d\.]', '', currency1[0])) + currency1_label = currency1[1] + + currency2_value = float(re.sub(r'[^\d\.]', '', currency2[0])) + currency2_label = currency2[1] + + return {'currencyValue1': currency1_value, + 'currencyLabel1': currency1_label, + 'currencyValue2': currency2_value, + 'currencyLabel2': currency2_label + } + return {} + + +def add_currency_card(soup: BeautifulSoup, + conversion_details: dict) -> BeautifulSoup: + """Adds the currency conversion boxes + to response of the search query + + Args: + soup: Parsed search result + conversion_details: Dictionary of currency + related information + + Returns: + BeautifulSoup + """ + # Element before which the code will be changed + # (This is the 'disclaimer' link) + element1 = soup.find('a', {'href': 'https://g.co/gfd'}) + + while 'class' not in element1.attrs or \ + 'nXE3Ob' not in element1.attrs['class']: + element1 = element1.parent + + # Creating the conversion factor + conversion_factor = (conversion_details['currencyValue1'] / + conversion_details['currencyValue2']) + + # Creating a new div for the input boxes + conversion_box = soup.new_tag('div') + conversion_box['class'] = 'conversion_box' + + # Currency to be converted from + input_box1 = soup.new_tag('input') + input_box1['id'] = 'cb1' + input_box1['type'] = 'number' + input_box1['class'] = 'cb' + input_box1['value'] = conversion_details['currencyValue1'] + input_box1['oninput'] = f'convert(1, 2, {1 / conversion_factor})' + + label_box1 = soup.new_tag('label') + label_box1['for'] = 'cb1' + label_box1['class'] = 'cb_label' + label_box1.append(conversion_details['currencyLabel1']) 
+ + br = soup.new_tag('br') + + # Currency to be converted to + input_box2 = soup.new_tag('input') + input_box2['id'] = 'cb2' + input_box2['type'] = 'number' + input_box2['class'] = 'cb' + input_box2['value'] = conversion_details['currencyValue2'] + input_box2['oninput'] = f'convert(2, 1, {conversion_factor})' + + label_box2 = soup.new_tag('label') + label_box2['for'] = 'cb2' + label_box2['class'] = 'cb_label' + label_box2.append(conversion_details['currencyLabel2']) + + conversion_box.append(input_box1) + conversion_box.append(label_box1) + conversion_box.append(br) + conversion_box.append(input_box2) + conversion_box.append(label_box2) + + element1.insert_before(conversion_box) + return soup + + +def get_tabs_content(tabs: dict, + full_query: str, + search_type: str, + preferences: str, + translation: dict) -> dict: + """Takes the default tabs content and updates it according to the query. + + Args: + tabs: The default content for the tabs + full_query: The original search query + search_type: The current search_type + translation: The translation to get the names of the tabs + + Returns: + dict: contains the name, the href and if the tab is selected or not + """ + map_query = full_query + if '-site:' in full_query: + block_idx = full_query.index('-site:') + map_query = map_query[:block_idx] + tabs = copy.deepcopy(tabs) + for tab_id, tab_content in tabs.items(): + # update name to desired language + if tab_id in translation: + tab_content['name'] = translation[tab_id] + + # update href with query + query = full_query.replace(f'&tbm={search_type}', '') + + if tab_content['tbm'] is not None: + query = f"{query}&tbm={tab_content['tbm']}" + + if preferences: + query = f"{query}&preferences={preferences}" + + tab_content['href'] = tab_content['href'].format( + query=query, + map_query=map_query) + + # update if selected tab (default all tab is selected) + if tab_content['tbm'] == search_type: + tabs['all']['selected'] = False + tab_content['selected'] = True + return 
tabs diff --git a/app/utils/search.py b/app/utils/search.py new file mode 100755 index 0000000..e76eee8 --- /dev/null +++ b/app/utils/search.py @@ -0,0 +1,194 @@ +import os +import re +from typing import Any +from app.filter import Filter +from app.request import gen_query +from app.utils.misc import get_proxy_host_url +from app.utils.results import get_first_link +from bs4 import BeautifulSoup as bsoup +from cryptography.fernet import Fernet, InvalidToken +from flask import g + +TOR_BANNER = '

You are using Tor


' +CAPTCHA = 'div class="g-recaptcha"' + + +def needs_https(url: str) -> bool: + """Checks if the current instance needs to be upgraded to HTTPS + + Note that all Heroku instances are available by default over HTTPS, but + do not automatically set up a redirect when visited over HTTP. + + Args: + url: The instance url + + Returns: + bool: True/False representing the need to upgrade + + """ + https_only = bool(os.getenv('HTTPS_ONLY', 0)) + is_heroku = url.endswith('.herokuapp.com') + is_http = url.startswith('http://') + + return (is_heroku and is_http) or (https_only and is_http) + + +def has_captcha(results: str) -> bool: + """Checks to see if the search results are blocked by a captcha + + Args: + results: The search page html as a string + + Returns: + bool: True/False indicating if a captcha element was found + + """ + return CAPTCHA in results + + +class Search: + """Search query preprocessor - used before submitting the query or + redirecting to another site + + Attributes: + request: the incoming flask request + config: the current user config settings + session_key: the flask user fernet key + """ + def __init__(self, request, config, session_key, cookies_disabled=False): + method = request.method + self.request = request + self.request_params = request.args if method == 'GET' else request.form + self.user_agent = request.headers.get('User-Agent') + self.feeling_lucky = False + self.config = config + self.session_key = session_key + self.query = '' + self.widget = '' + self.view_image = True + self.cookies_disabled = cookies_disabled + self.search_type = self.request_params.get( + 'tbm') if 'tbm' in self.request_params else '' + + def __getitem__(self, name) -> Any: + return getattr(self, name) + + def __setitem__(self, name, value) -> None: + return setattr(self, name, value) + + def __delitem__(self, name) -> None: + return delattr(self, name) + + def __contains__(self, name) -> bool: + return hasattr(self, name) + + def new_search_query(self) -> str: + 
"""Parses a plaintext query into a valid string for submission + + Also decrypts the query string, if encrypted (in the case of + paginated results). + + Returns: + str: A valid query string + + """ + q = self.request_params.get('q') + + if q is None or len(q) == 0: + return '' + else: + # Attempt to decrypt if this is an internal link + try: + q = Fernet(self.session_key).decrypt(q.encode()).decode() + except InvalidToken: + pass + + # Strip '!' for "feeling lucky" queries + if match := re.search(r"(^|\s)!($|\s)", q): + self.feeling_lucky = True + start, end = match.span() + self.query = " ".join([seg for seg in [q[:start], q[end:]] if seg]) + else: + self.feeling_lucky = False + self.query = q + + # Check for possible widgets + self.widget = "ip" if re.search("([^a-z0-9]|^)my *[^a-z0-9] *(ip|internet protocol)" + + "($|( *[^a-z0-9] *(((addres|address|adres|" + + "adress)|a)? *$)))", self.query.lower()) else self.widget + self.widget = 'calculator' if re.search( + r"\bcalculator\b|\bcalc\b|\bcalclator\b|\bmath\b", + self.query.lower()) else self.widget + return self.query + + def generate_response(self) -> str: + """Generates a response for the user's query + + Returns: + str: A string response to the search query, in the form of a URL + or string representation of HTML content. 
+ + """ + mobile = 'Android' in self.user_agent or 'iPhone' in self.user_agent + # reconstruct url if X-Forwarded-Host header present + root_url = get_proxy_host_url( + self.request, + self.request.url_root, + root=True) + + content_filter = Filter(self.session_key, + root_url=root_url, + mobile=mobile, + config=self.config, + query=self.query) + full_query = gen_query(self.query, + self.request_params, + self.config) + self.full_query = full_query + + # force mobile search when view image is true and + # the request is not already made by a mobile + view_image = ('tbm=isch' in full_query + # and self.config.view_image + #and not g.user_request.mobile + ) + + get_body = g.user_request.send(query=full_query, + force_mobile=view_image, + user_agent=self.user_agent) + + + # Produce cleanable html soup from response + get_body_safed = get_body.text.replace("<","andlt;").replace(">","andgt;") + html_soup = bsoup(get_body_safed, 'html.parser') + + # Replace current soup if view_image is active + if view_image: + html_soup = content_filter.view_image(html_soup) + + # Indicate whether or not a Tor connection is active + if g.user_request.tor_valid: + html_soup.insert(0, bsoup(TOR_BANNER, 'html.parser')) + + formatted_results = content_filter.clean(html_soup) + if self.feeling_lucky: + if lucky_link := get_first_link(formatted_results): + return lucky_link + + # Fall through to regular search if unable to find link + self.feeling_lucky = False + + # Append user config to all search links, if available + param_str = ''.join('&{}={}'.format(k, v) + for k, v in + self.request_params.to_dict(flat=True).items() + if self.config.is_safe_key(k)) + for link in formatted_results.find_all('a', href=True): + link['rel'] = "nofollow noopener noreferrer" + if 'search?' 
not in link['href'] or link['href'].index( + 'search?') > 1: + continue + link['href'] += param_str + + return str(formatted_results) + diff --git a/app/utils/session.py b/app/utils/session.py new file mode 100755 index 0000000..5bac42b --- /dev/null +++ b/app/utils/session.py @@ -0,0 +1,39 @@ +from cryptography.fernet import Fernet +from flask import current_app as app + +REQUIRED_SESSION_VALUES = ['uuid', 'config', 'key', 'auth'] + + +def generate_key() -> bytes: + """Generates a key for encrypting searches and element URLs + + Args: + cookies_disabled: Flag for whether or not cookies are disabled by the + user. If so, the user can only use the default key + generated on app init for queries. + + Returns: + str: A unique Fernet key + + """ + # Generate/regenerate unique key per user + return Fernet.generate_key() + + +def valid_user_session(session: dict) -> bool: + """Validates the current user session + + Args: + session: The current Flask user session + + Returns: + bool: True/False indicating that all required session values are + available + + """ + # Generate secret key for user if unavailable + for value in REQUIRED_SESSION_VALUES: + if value not in session: + return False + + return True diff --git a/app/utils/widgets.py b/app/utils/widgets.py new file mode 100755 index 0000000..156ada9 --- /dev/null +++ b/app/utils/widgets.py @@ -0,0 +1,71 @@ +from pathlib import Path +from bs4 import BeautifulSoup + + +# root +BASE_DIR = Path(__file__).parent.parent.parent + +def add_ip_card(html_soup: BeautifulSoup, ip: str) -> BeautifulSoup: + """Adds the client's IP address to the search results + if query contains keywords + + Args: + html_soup: The parsed search result containing the keywords + ip: ip address of the client + + Returns: + BeautifulSoup + + """ + main_div = html_soup.select_one('#main') + if main_div: + # HTML IP card tag + ip_tag = html_soup.new_tag('div') + ip_tag['class'] = 'ZINbbc xpd O9g5cc uUPGi' + + # For IP Address html tag + ip_address = 
html_soup.new_tag('div') + ip_address['class'] = 'kCrYT ip-address-div' + ip_address.string = ip + + # Text below the IP address + ip_text = html_soup.new_tag('div') + ip_text.string = 'Your public IP address' + ip_text['class'] = 'kCrYT ip-text-div' + + # Adding all the above html tags to the IP card + ip_tag.append(ip_address) + ip_tag.append(ip_text) + + # Insert the element at the top of the result list + main_div.insert_before(ip_tag) + return html_soup + +def add_calculator_card(html_soup: BeautifulSoup) -> BeautifulSoup: + """Adds the a calculator widget to the search results + if query contains keywords + + Args: + html_soup: The parsed search result containing the keywords + + Returns: + BeautifulSoup + """ + main_div = html_soup.select_one('#main') + if main_div: + # absolute path + widget_file = open(BASE_DIR / 'app/static/widgets/calculator.html', encoding="utf8") + widget_tag = html_soup.new_tag('div') + widget_tag['class'] = 'ZINbbc xpd O9g5cc uUPGi' + widget_tag['id'] = 'calculator-wrapper' + calculator_text = html_soup.new_tag('div') + calculator_text['class'] = 'kCrYT ip-address-div' + calculator_text.string = 'Calculator' + calculator_widget = html_soup.new_tag('div') + calculator_widget.append(BeautifulSoup(widget_file, 'html.parser')) + calculator_widget['class'] = 'kCrYT ip-text-div' + widget_tag.append(calculator_text) + widget_tag.append(calculator_widget) + main_div.insert_before(widget_tag) + widget_file.close() + return html_soup diff --git a/app/version.py b/app/version.py new file mode 100755 index 0000000..31eead6 --- /dev/null +++ b/app/version.py @@ -0,0 +1,7 @@ +import os + +optional_dev_tag = '' +if os.getenv('DEV_BUILD'): + optional_dev_tag = '.dev' + os.getenv('DEV_BUILD') + +__version__ = '0.9.1' + optional_dev_tag diff --git a/charts/whoogle/.helmignore b/charts/whoogle/.helmignore new file mode 100644 index 0000000..0e8a0eb --- /dev/null +++ b/charts/whoogle/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building 
packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/charts/whoogle/Chart.yaml b/charts/whoogle/Chart.yaml new file mode 100644 index 0000000..8ce2224 --- /dev/null +++ b/charts/whoogle/Chart.yaml @@ -0,0 +1,23 @@ +apiVersion: v2 +name: whoogle +description: A self hosted search engine on Kubernetes +type: application +version: 0.1.0 +appVersion: 0.9.1 + +icon: https://github.com/benbusby/whoogle-search/raw/main/app/static/img/favicon/favicon-96x96.png + +sources: + - https://github.com/benbusby/whoogle-search + - https://gitlab.com/benbusby/whoogle-search + - https://gogs.benbusby.com/benbusby/whoogle-search + +keywords: + - whoogle + - degoogle + - search + - google + - search-engine + - privacy + - tor + - python diff --git a/charts/whoogle/templates/NOTES.txt b/charts/whoogle/templates/NOTES.txt new file mode 100644 index 0000000..bbbf070 --- /dev/null +++ b/charts/whoogle/templates/NOTES.txt @@ -0,0 +1,22 @@ +1. Get the application URL by running these commands: +{{- if .Values.ingress.enabled }} +{{- range $host := .Values.ingress.hosts }} + {{- range .paths }} + http{{ if $.Values.ingress.tls }}s{{ end }}://{{ $host.host }}{{ .path }} + {{- end }} +{{- end }} +{{- else if contains "NodePort" .Values.service.type }} + export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "whoogle.fullname" . }}) + export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") + echo http://$NODE_IP:$NODE_PORT +{{- else if contains "LoadBalancer" .Values.service.type }} + NOTE: It may take a few minutes for the LoadBalancer IP to be available. 
+ You can watch the status of by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "whoogle.fullname" . }}' + export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "whoogle.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}") + echo http://$SERVICE_IP:{{ .Values.service.port }} +{{- else if contains "ClusterIP" .Values.service.type }} + export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "whoogle.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}") + export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}") + echo "Visit http://127.0.0.1:8080 to use your application" + kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080:$CONTAINER_PORT +{{- end }} diff --git a/charts/whoogle/templates/_helpers.tpl b/charts/whoogle/templates/_helpers.tpl new file mode 100644 index 0000000..4b51048 --- /dev/null +++ b/charts/whoogle/templates/_helpers.tpl @@ -0,0 +1,62 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "whoogle.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. 
+*/}} +{{- define "whoogle.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "whoogle.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "whoogle.labels" -}} +helm.sh/chart: {{ include "whoogle.chart" . }} +{{ include "whoogle.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "whoogle.selectorLabels" -}} +app.kubernetes.io/name: {{ include "whoogle.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "whoogle.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "whoogle.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} diff --git a/charts/whoogle/templates/deployment.yaml b/charts/whoogle/templates/deployment.yaml new file mode 100644 index 0000000..3da9f1e --- /dev/null +++ b/charts/whoogle/templates/deployment.yaml @@ -0,0 +1,82 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "whoogle.fullname" . }} + labels: + {{- include "whoogle.labels" . | nindent 4 }} +spec: + {{- if not .Values.autoscaling.enabled }} + replicas: {{ .Values.replicaCount }} + {{- end }} + selector: + matchLabels: + {{- include "whoogle.selectorLabels" . 
| nindent 6 }} + template: + metadata: + {{- with .Values.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "whoogle.selectorLabels" . | nindent 8 }} + spec: + {{- with .Values.image.pullSecrets }} + imagePullSecrets: + {{- range .}} + - name: {{ . }} + {{- end }} + {{- end }} + serviceAccountName: {{ include "whoogle.serviceAccountName" . }} + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + containers: + - name: whoogle + securityContext: + {{- toYaml .Values.securityContext | nindent 12 }} + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + {{- with .Values.conf }} + env: + {{- range $k,$v := . }} + {{- if $v }} + - name: {{ $k }} + value: {{ tpl (toString $v) $ | quote }} + {{- end }} + {{- end }} + {{- end }} + ports: + - name: http + containerPort: {{ default 5000 .Values.conf.EXPOSE_PORT }} + protocol: TCP + livenessProbe: + httpGet: + path: / + port: http + {{- if and .Values.conf.WHOOGLE_USER .Values.conf.WHOOGLE_PASS }} + httpHeaders: + - name: Authorization + value: Basic {{ b64enc (printf "%s:%s" .Values.conf.WHOOGLE_USER .Values.conf.WHOOGLE_PASS) }} + {{- end }} + readinessProbe: + httpGet: + path: / + port: http + {{- if and .Values.conf.WHOOGLE_USER .Values.conf.WHOOGLE_PASS }} + httpHeaders: + - name: Authorization + value: Basic {{ b64enc (printf "%s:%s" .Values.conf.WHOOGLE_USER .Values.conf.WHOOGLE_PASS) }} + {{- end }} + resources: + {{- toYaml .Values.resources | nindent 12 }} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . 
| nindent 8 }} + {{- end }} diff --git a/charts/whoogle/templates/hpa.yaml b/charts/whoogle/templates/hpa.yaml new file mode 100644 index 0000000..74be742 --- /dev/null +++ b/charts/whoogle/templates/hpa.yaml @@ -0,0 +1,28 @@ +{{- if .Values.autoscaling.enabled }} +apiVersion: autoscaling/v2beta1 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "whoogle.fullname" . }} + labels: + {{- include "whoogle.labels" . | nindent 4 }} +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ include "whoogle.fullname" . }} + minReplicas: {{ .Values.autoscaling.minReplicas }} + maxReplicas: {{ .Values.autoscaling.maxReplicas }} + metrics: + {{- if .Values.autoscaling.targetCPUUtilizationPercentage }} + - type: Resource + resource: + name: cpu + targetAverageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }} + {{- end }} + {{- if .Values.autoscaling.targetMemoryUtilizationPercentage }} + - type: Resource + resource: + name: memory + targetAverageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }} + {{- end }} +{{- end }} diff --git a/charts/whoogle/templates/ingress.yaml b/charts/whoogle/templates/ingress.yaml new file mode 100644 index 0000000..7fec7e9 --- /dev/null +++ b/charts/whoogle/templates/ingress.yaml @@ -0,0 +1,61 @@ +{{- if .Values.ingress.enabled -}} +{{- $fullName := include "whoogle.fullname" . 
-}} +{{- $svcPort := .Values.service.port -}} +{{- if and .Values.ingress.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }} + {{- if not (hasKey .Values.ingress.annotations "kubernetes.io/ingress.class") }} + {{- $_ := set .Values.ingress.annotations "kubernetes.io/ingress.class" .Values.ingress.className}} + {{- end }} +{{- end }} +{{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}} +apiVersion: networking.k8s.io/v1 +{{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}} +apiVersion: networking.k8s.io/v1beta1 +{{- else -}} +apiVersion: extensions/v1beta1 +{{- end }} +kind: Ingress +metadata: + name: {{ $fullName }} + labels: + {{- include "whoogle.labels" . | nindent 4 }} + {{- with .Values.ingress.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if and .Values.ingress.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }} + ingressClassName: {{ .Values.ingress.className }} + {{- end }} + {{- if .Values.ingress.tls }} + tls: + {{- range .Values.ingress.tls }} + - hosts: + {{- range .hosts }} + - {{ . 
| quote }} + {{- end }} + secretName: {{ .secretName }} + {{- end }} + {{- end }} + rules: + {{- range .Values.ingress.hosts }} + - host: {{ .host | quote }} + http: + paths: + {{- range .paths }} + - path: {{ .path }} + {{- if and .pathType (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }} + pathType: {{ .pathType }} + {{- end }} + backend: + {{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }} + service: + name: {{ $fullName }} + port: + number: {{ $svcPort }} + {{- else }} + serviceName: {{ $fullName }} + servicePort: {{ $svcPort }} + {{- end }} + {{- end }} + {{- end }} +{{- end }} diff --git a/charts/whoogle/templates/service.yaml b/charts/whoogle/templates/service.yaml new file mode 100644 index 0000000..96521c4 --- /dev/null +++ b/charts/whoogle/templates/service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "whoogle.fullname" . }} + labels: + {{- include "whoogle.labels" . | nindent 4 }} +spec: + type: {{ .Values.service.type }} + ports: + - port: {{ .Values.service.port }} + targetPort: http + protocol: TCP + name: http + selector: + {{- include "whoogle.selectorLabels" . | nindent 4 }} diff --git a/charts/whoogle/templates/serviceaccount.yaml b/charts/whoogle/templates/serviceaccount.yaml new file mode 100644 index 0000000..de1398a --- /dev/null +++ b/charts/whoogle/templates/serviceaccount.yaml @@ -0,0 +1,12 @@ +{{- if .Values.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "whoogle.serviceAccountName" . }} + labels: + {{- include "whoogle.labels" . | nindent 4 }} + {{- with .Values.serviceAccount.annotations }} + annotations: + {{- toYaml . 
| nindent 4 }} + {{- end }} +{{- end }} diff --git a/charts/whoogle/templates/tests/test-connection.yaml b/charts/whoogle/templates/tests/test-connection.yaml new file mode 100644 index 0000000..bc06188 --- /dev/null +++ b/charts/whoogle/templates/tests/test-connection.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Pod +metadata: + name: "{{ include "whoogle.fullname" . }}-test-connection" + labels: + {{- include "whoogle.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": test +spec: + containers: + - name: wget + image: busybox + command: ['wget'] + args: ['{{ include "whoogle.fullname" . }}:{{ .Values.service.port }}'] + restartPolicy: Never diff --git a/charts/whoogle/values.yaml b/charts/whoogle/values.yaml new file mode 100644 index 0000000..54beded --- /dev/null +++ b/charts/whoogle/values.yaml @@ -0,0 +1,115 @@ +# Default values for whoogle. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. + +nameOverride: "" +fullnameOverride: "" + +replicaCount: 1 +image: + repository: benbusby/whoogle-search + pullPolicy: IfNotPresent + # Overrides the image tag whose default is the chart appVersion. + tag: "" + pullSecrets: [] + # - my-image-pull-secret + +serviceAccount: + # Specifies whether a service account should be created + create: true + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: "" + +conf: {} + # WHOOGLE_URL_PREFIX: "" # The URL prefix to use for the whoogle instance (i.e. "/whoogle") + # WHOOGLE_DOTENV: "" # Load environment variables in whoogle.env + # WHOOGLE_USER: "" # The username for basic auth. WHOOGLE_PASS must also be set if used. + # WHOOGLE_PASS: "" # The password for basic auth. WHOOGLE_USER must also be set if used. + # WHOOGLE_PROXY_USER: "" # The username of the proxy server. + # WHOOGLE_PROXY_PASS: "" # The password of the proxy server. 
+ # WHOOGLE_PROXY_TYPE: "" # The type of the proxy server. Can be "socks5", "socks4", or "http". + # WHOOGLE_PROXY_LOC: "" # The location of the proxy server (host or ip). + # EXPOSE_PORT: "" # The port where Whoogle will be exposed. (default 5000) + # HTTPS_ONLY: "" # Enforce HTTPS. (See https://github.com/benbusby/whoogle-search#https-enforcement) + # WHOOGLE_ALT_TW: "" # The twitter.com alternative to use when site alternatives are enabled in the config. + # WHOOGLE_ALT_YT: "" # The youtube.com alternative to use when site alternatives are enabled in the config. + # WHOOGLE_ALT_RD: "" # The reddit.com alternative to use when site alternatives are enabled in the config. + # WHOOGLE_ALT_TL: "" # The Google Translate alternative to use. This is used for all "translate ____" searches. + # WHOOGLE_ALT_MD: "" # The medium.com alternative to use when site alternatives are enabled in the config. + # WHOOGLE_ALT_IMG: "" # The imgur.com alternative to use when site alternatives are enabled in the config. + # WHOOGLE_ALT_WIKI: "" # The wikipedia.com alternative to use when site alternatives are enabled in the config. + # WHOOGLE_ALT_IMDB: "" # The imdb.com alternative to use. Set to "" to continue using imdb.com when site alternatives are enabled. + # WHOOGLE_ALT_QUORA: "" # The quora.com alternative to use. Set to "" to continue using quora.com when site alternatives are enabled. + # WHOOGLE_ALT_SO: "" # The stackoverflow.com alternative to use. Set to "" to continue using stackoverflow.com when site alternatives are enabled. + # WHOOGLE_AUTOCOMPLETE: "" # Controls visibility of autocomplete/search suggestions. Default on -- use '0' to disable + # WHOOGLE_MINIMAL: "" # Remove everything except basic result cards from all search queries. 
+ + # WHOOGLE_CONFIG_DISABLE: "" # Hide config from UI and disallow changes to config by client + # WHOOGLE_CONFIG_COUNTRY: "" # Filter results by hosting country + # WHOOGLE_CONFIG_LANGUAGE: "" # Set interface language + # WHOOGLE_CONFIG_SEARCH_LANGUAGE: "" # Set search result language + # WHOOGLE_CONFIG_BLOCK: "" # Block websites from search results (use comma-separated list) + # WHOOGLE_CONFIG_THEME: "" # Set theme mode (light, dark, or system) + # WHOOGLE_CONFIG_SAFE: "" # Enable safe searches + # WHOOGLE_CONFIG_ALTS: "" # Use social media site alternatives (nitter, invidious, etc) + # WHOOGLE_CONFIG_NEAR: "" # Restrict results to only those near a particular city + # WHOOGLE_CONFIG_TOR: "" # Use Tor routing (if available) + # WHOOGLE_CONFIG_NEW_TAB: "" # Always open results in new tab + # WHOOGLE_CONFIG_VIEW_IMAGE: "" # Enable View Image option + # WHOOGLE_CONFIG_GET_ONLY: "" # Search using GET requests only + # WHOOGLE_CONFIG_URL: "" # The root url of the instance (e.g. https://whoogle.example.com/) + # WHOOGLE_CONFIG_STYLE: "" # The custom CSS to use for styling (should be single line) + # WHOOGLE_CONFIG_PREFERENCES_ENCRYPTED: "" # Encrypt preferences token, requires key + # WHOOGLE_CONFIG_PREFERENCES_KEY: "" # Key to encrypt preferences in URL (REQUIRED to show url) + +podAnnotations: {} +podSecurityContext: {} + # fsGroup: 2000 +securityContext: + runAsUser: 0 + # capabilities: + # drop: + # - ALL + # readOnlyRootFilesystem: true + +service: + type: ClusterIP + port: 5000 + +ingress: + enabled: false + className: "" + annotations: {} + # kubernetes.io/ingress.class: nginx + # kubernetes.io/tls-acme: "true" + hosts: + - host: whoogle.example.com + paths: + - path: / + pathType: ImplementationSpecific + tls: [] + # - secretName: chart-example-tls + # hosts: + # - whoogle.example.com + +resources: {} + # requests: + # cpu: 100m + # memory: 128Mi + # limits: + # cpu: 100m + # memory: 128Mi + +autoscaling: + enabled: false + minReplicas: 1 + maxReplicas: 100 +
targetCPUUtilizationPercentage: 80 + # targetMemoryUtilizationPercentage: 80 + +nodeSelector: {} +tolerations: [] +affinity: {} diff --git a/docker-compose-traefik.yaml b/docker-compose-traefik.yaml new file mode 100644 index 0000000..4cfb588 --- /dev/null +++ b/docker-compose-traefik.yaml @@ -0,0 +1,81 @@ +# can't use mem_limit in a 3.x docker-compose file in non swarm mode +# see https://github.com/docker/compose/issues/4513 +version: "2.4" + +services: + traefik: + image: "traefik:v2.7" + container_name: "traefik" + command: + #- "--log.level=DEBUG" + - "--api.insecure=true" + - "--providers.docker=true" + - "--providers.docker.exposedbydefault=false" + - "--entrypoints.websecure.address=:443" + - "--certificatesresolvers.myresolver.acme.tlschallenge=true" + #- "--certificatesresolvers.myresolver.acme.caserver=https://acme-staging-v02.api.letsencrypt.org/directory" + - "--certificatesresolvers.myresolver.acme.email=change@domain.name" + - "--certificatesresolvers.myresolver.acme.storage=/letsencrypt/acme.json" + ports: + - "443:443" + - "8080:8080" + volumes: + - "./letsencrypt:/letsencrypt" + - "/var/run/docker.sock:/var/run/docker.sock:ro" + + whoogle-search: + labels: + - "traefik.enable=true" + - "traefik.http.routers.whoami.rule=Host(`change.host.name`)" + - "traefik.http.routers.whoami.entrypoints=websecure" + - "traefik.http.routers.whoami.tls.certresolver=myresolver" + - "traefik.http.services.whoogle-search.loadbalancer.server.port=5000" + image: ${WHOOGLE_IMAGE:-benbusby/whoogle-search} + container_name: whoogle-search + restart: unless-stopped + pids_limit: 50 + mem_limit: 256mb + memswap_limit: 256mb + # user debian-tor from tor package + user: whoogle + security_opt: + - no-new-privileges + cap_drop: + - ALL + tmpfs: + - /config/:size=10M,uid=927,gid=927,mode=1700 + - /var/lib/tor/:size=15M,uid=927,gid=927,mode=1700 + - /run/tor/:size=1M,uid=927,gid=927,mode=1700 + environment: # Uncomment to configure environment variables + # Basic auth 
configuration, uncomment to enable + #- WHOOGLE_USER= + #- WHOOGLE_PASS= + # Proxy configuration, uncomment to enable + #- WHOOGLE_PROXY_USER= + #- WHOOGLE_PROXY_PASS= + #- WHOOGLE_PROXY_TYPE= + # Site alternative configurations, uncomment to enable + # Note: If not set, the feature will still be available + # with default values. + #- WHOOGLE_ALT_TW=farside.link/nitter + #- WHOOGLE_ALT_YT=farside.link/invidious + #- WHOOGLE_ALT_IG=farside.link/bibliogram/u + #- WHOOGLE_ALT_RD=farside.link/libreddit + #- WHOOGLE_ALT_MD=farside.link/scribe + #- WHOOGLE_ALT_TL=farside.link/lingva + #- WHOOGLE_ALT_IMG=farside.link/rimgo + #- WHOOGLE_ALT_WIKI=farside.link/wikiless + #- WHOOGLE_ALT_IMDB=farside.link/libremdb + #- WHOOGLE_ALT_QUORA=farside.link/quetre + #- WHOOGLE_ALT_SO=farside.link/anonymousoverflow + # - WHOOGLE_CONFIG_DISABLE=1 + # - WHOOGLE_CONFIG_SEARCH_LANGUAGE=lang_en + # - WHOOGLE_CONFIG_GET_ONLY=1 + # - WHOOGLE_CONFIG_COUNTRY=FR + # - WHOOGLE_CONFIG_PREFERENCES_ENCRYPTED=1 + # - WHOOGLE_CONFIG_PREFERENCES_KEY="NEEDS_TO_BE_MODIFIED" + #env_file: # Alternatively, load variables from whoogle.env + #- whoogle.env + ports: + - 8000:5000 diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..0a693e6 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,49 @@ +# can't use mem_limit in a 3.x docker-compose file in non swarm mode +# see https://github.com/docker/compose/issues/4513 +version: "2.4" + +services: + whoogle-search: + image: ${WHOOGLE_IMAGE:-benbusby/whoogle-search} + container_name: whoogle-search + restart: unless-stopped + pids_limit: 50 + mem_limit: 256mb + memswap_limit: 256mb + # user debian-tor from tor package + user: whoogle + security_opt: + - no-new-privileges + cap_drop: + - ALL + tmpfs: + - /config/:size=10M,uid=927,gid=927,mode=1700 + - /var/lib/tor/:size=15M,uid=927,gid=927,mode=1700 + - /run/tor/:size=1M,uid=927,gid=927,mode=1700 + #environment: # Uncomment to configure environment variables + # Basic auth 
configuration, uncomment to enable + #- WHOOGLE_USER= + #- WHOOGLE_PASS= + # Proxy configuration, uncomment to enable + #- WHOOGLE_PROXY_USER= + #- WHOOGLE_PROXY_PASS= + #- WHOOGLE_PROXY_TYPE= + # Site alternative configurations, uncomment to enable + # Note: If not set, the feature will still be available + # with default values. + #- WHOOGLE_ALT_TW=farside.link/nitter + #- WHOOGLE_ALT_YT=farside.link/invidious + #- WHOOGLE_ALT_IG=farside.link/bibliogram/u + #- WHOOGLE_ALT_RD=farside.link/libreddit + #- WHOOGLE_ALT_MD=farside.link/scribe + #- WHOOGLE_ALT_TL=farside.link/lingva + #- WHOOGLE_ALT_IMG=farside.link/rimgo + #- WHOOGLE_ALT_WIKI=farside.link/wikiless + #- WHOOGLE_ALT_IMDB=farside.link/libremdb + #- WHOOGLE_ALT_QUORA=farside.link/quetre + #- WHOOGLE_ALT_SO=farside.link/anonymousoverflow + #env_file: # Alternatively, load variables from whoogle.env + #- whoogle.env + ports: + - 5000:5000 diff --git a/docs/banner.png b/docs/banner.png new file mode 100644 index 0000000..2b895c1 Binary files /dev/null and b/docs/banner.png differ diff --git a/docs/screenshot_desktop.png b/docs/screenshot_desktop.png new file mode 100644 index 0000000..098ec13 Binary files /dev/null and b/docs/screenshot_desktop.png differ diff --git a/docs/screenshot_mobile.png b/docs/screenshot_mobile.png new file mode 100644 index 0000000..5d6e3f0 Binary files /dev/null and b/docs/screenshot_mobile.png differ diff --git a/heroku.yml b/heroku.yml new file mode 100644 index 0000000..cfc938e --- /dev/null +++ b/heroku.yml @@ -0,0 +1,4 @@ +build: + docker: + web: Dockerfile + diff --git a/letsencrypt/acme.json b/letsencrypt/acme.json new file mode 100644 index 0000000..e69de29 diff --git a/misc/heroku-regen.sh b/misc/heroku-regen.sh new file mode 100755 index 0000000..198edcf --- /dev/null +++ b/misc/heroku-regen.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# Assumes this is being executed from a session that has already logged +# into Heroku with "heroku login -i" beforehand. 
+# +# You can set this up to run every night when you aren't using the +# instance with a cronjob. For example: +# 0 3 * * * /home/pi/whoogle-search/config/heroku-regen.sh + +HEROKU_CLI_SITE="https://devcenter.heroku.com/articles/heroku-cli" + +if ! [[ -x "$(command -v heroku)" ]]; then + echo "Must have heroku cli installed: $HEROKU_CLI_SITE" + exit 1 +fi + +cd "$(builtin cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)/../" + +if [[ $# -ne 1 ]]; then + echo -e "Must provide the name of the Whoogle instance to regenerate" + exit 1 +fi + +APP_NAME="$1" + +heroku apps:destroy "$APP_NAME" --confirm "$APP_NAME" +heroku apps:create "$APP_NAME" +heroku container:login +heroku container:push web +heroku container:release web diff --git a/misc/instances.txt b/misc/instances.txt new file mode 100644 index 0000000..228b0c5 --- /dev/null +++ b/misc/instances.txt @@ -0,0 +1,24 @@ +https://search.albony.xyz +https://search.garudalinux.org +https://search.dr460nf1r3.org +https://search.nezumi.party +https://s.tokhmi.xyz +https://search.sethforprivacy.com +https://whoogle.dcs0.hu +https://whoogle.lunar.icu +https://gowogle.voring.me +https://whoogle.privacydev.net +https://whoogle.hostux.net +https://wg.vern.cc +https://whoogle.hxvy0.gq +https://whoogle.ungovernable.men +https://whoogle2.ungovernable.men +https://whoogle3.ungovernable.men +https://wgl.frail.duckdns.org +https://whoogle.no-logs.com +https://whoogle.ftw.lol +https://whoogle-search--replitcomreside.repl.co +https://search.notrustverify.ch +https://whoogle.datura.network +https://whoogle.yepserver.xyz +https://search.snine.nl diff --git a/misc/replit.py b/misc/replit.py new file mode 100644 index 0000000..ce222c7 --- /dev/null +++ b/misc/replit.py @@ -0,0 +1,5 @@ +import subprocess + +# A plague upon Replit and all who have built it +replit_cmd = "killall -q python3 > /dev/null 2>&1; pip install -r requirements.txt && ./run" +subprocess.run(replit_cmd, shell=True) diff --git a/misc/tor/start-tor.sh 
b/misc/tor/start-tor.sh new file mode 100755 index 0000000..8d77ee6 --- /dev/null +++ b/misc/tor/start-tor.sh @@ -0,0 +1,33 @@ +#!/bin/sh + +FF_STRING="FascistFirewall 1" +# Use "=" (not "==") below: POSIX test, as run by /bin/sh, only defines "=" +if [ "$WHOOGLE_TOR_SERVICE" = "0" ]; then + echo "Skipping Tor startup..." + exit 0 +fi + +if [ "$WHOOGLE_TOR_FF" = "1" ]; then + if (grep -q "$FF_STRING" /etc/tor/torrc); then + echo "FascistFirewall feature already enabled." + else + echo "$FF_STRING" >> /etc/tor/torrc + + if [ "$?" -eq 0 ]; then + echo "FascistFirewall added to /etc/tor/torrc" + else + echo "ERROR: Unable to modify /etc/tor/torrc with $FF_STRING." + exit 1 + fi + fi +fi + +if [ "$(whoami)" != "root" ]; then + tor -f /etc/tor/torrc +else + if (grep alpine /etc/os-release >/dev/null); then + rc-service tor start + else + service tor start + fi +fi diff --git a/misc/tor/torrc b/misc/tor/torrc new file mode 100644 index 0000000..30c6638 --- /dev/null +++ b/misc/tor/torrc @@ -0,0 +1,12 @@ +DataDirectory /var/lib/tor +ControlPort 9051 +CookieAuthentication 1 +DataDirectoryGroupReadable 1 +CookieAuthFileGroupReadable 1 +ExtORPortCookieAuthFileGroupReadable 1 +CacheDirectoryGroupReadable 1 +CookieAuthFile /var/lib/tor/control_auth_cookie +Log debug-notice file /dev/null +# UseBridges 1 +# ClientTransportPlugin obfs4 exec /usr/bin/obfs4proxy +# Bridge obfs4 ip and so on diff --git a/misc/update-translations.py b/misc/update-translations.py new file mode 100644 index 0000000..d5388f1 --- /dev/null +++ b/misc/update-translations.py @@ -0,0 +1,67 @@ +import json +import pathlib +import requests + +lingva = 'https://lingva.ml/api/v1/en' + + +def format_lang(lang: str) -> str: + # Chinese needs lingva-specific codes. lang may still carry the + # 'lang_' prefix here, so match zh-TW by suffix rather than equality + if 'zh-' in lang: + if lang.endswith('zh-TW'): + return 'zh_HANT' + return 'zh' + + # Strip lang prefix to leave only the actual + # language code (i.e.
'en', 'fr', etc) + return lang.replace('lang_', '') + + +def translate(v: str, lang: str) -> str: + # Strip lang prefix to leave only the actual + #language code (i.e. "es", "fr", etc) + lang = format_lang(lang) + + lingva_req = f'{lingva}/{lang}/{v}' + + response = requests.get(lingva_req).json() + + if 'translation' in response: + return response['translation'] + return '' + + +if __name__ == '__main__': + file_path = pathlib.Path(__file__).parent.resolve() + tl_path = 'app/static/settings/translations.json' + + with open(f'{file_path}/../{tl_path}', 'r+', encoding='utf-8') as tl_file: + tl_data = json.load(tl_file) + + # If there are any english translations that don't + # exist for other languages, extract them and translate + # them now + en_tl = tl_data['lang_en'] + for k, v in en_tl.items(): + for lang in tl_data: + if lang == 'lang_en' or k in tl_data[lang]: + continue + + translation = '' + if len(k) == 0: + # Special case for placeholder text that gets used + # for translations without any key present + translation = v + else: + # Translate the string using lingva + translation = translate(v, lang) + + if len(translation) == 0: + print(f'! 
Unable to translate {lang}[{k}]') + continue + print(f'{lang}[{k}] = {translation}') + tl_data[lang][k] = translation + + # Write out updated translations json + print(json.dumps(tl_data, indent=4, ensure_ascii=False)) diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..9787c3b --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,3 @@ +[build-system] +requires = ["setuptools", "wheel"] +build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..2a39b9b --- /dev/null +++ b/requirements.txt @@ -0,0 +1,37 @@ +attrs==22.2.0 +beautifulsoup4==4.11.2 +brotli==1.0.9 +cachelib==0.10.2 +certifi==2024.7.4 +cffi==1.17.1 +chardet==5.1.0 +click==8.1.3 +cryptography==3.3.2; platform_machine == 'armv7l' +cryptography==43.0.1; platform_machine != 'armv7l' +cssutils==2.6.0 +defusedxml==0.7.1 +Flask==2.3.2 +idna==3.7 +itsdangerous==2.1.2 +Jinja2==3.1.4 +MarkupSafe==2.1.2 +more-itertools==9.0.0 +packaging==23.0 +pluggy==1.0.0 +pycodestyle==2.10.0 +pycparser==2.22 +pyOpenSSL==19.1.0; platform_machine == 'armv7l' +pyOpenSSL==24.2.1; platform_machine != 'armv7l' +pyparsing==3.0.9 +PySocks==1.7.1 +pytest==7.2.1 +python-dateutil==2.8.2 +requests==2.32.2 +soupsieve==2.4 +stem==1.8.1 +urllib3==1.26.19 +validators==0.22.0 +waitress==3.0.1 +wcwidth==0.2.6 +Werkzeug==3.0.6 +python-dotenv==0.21.1 diff --git a/run b/run new file mode 100755 index 0000000..18c2f66 --- /dev/null +++ b/run @@ -0,0 +1,37 @@ +#!/bin/sh +# Usage: +# ./run # Runs the full web app +# ./run test # Runs the testing suite + +set -e + +SCRIPT_DIR="$(CDPATH= command cd -- "$(dirname -- "$0")" && pwd -P)" + +# Set directory to serve static content from +SUBDIR="${1:-app}" +export APP_ROOT="$SCRIPT_DIR/$SUBDIR" +export STATIC_FOLDER="$APP_ROOT/static" + +# Clear out build directory +rm -f "$SCRIPT_DIR"/app/static/build/*.js +rm -f "$SCRIPT_DIR"/app/static/build/*.css + +# Check for regular vs test run +if [ "$SUBDIR" = "test" ]; then + # Set 
up static files for testing + rm -rf "$STATIC_FOLDER" + ln -s "$SCRIPT_DIR/app/static" "$STATIC_FOLDER" + pytest -sv +else + mkdir -p "$STATIC_FOLDER" + + if [ ! -z "$UNIX_SOCKET" ]; then + python3 -um app \ + --unix-socket "$UNIX_SOCKET" + else + echo "Running on http://${ADDRESS:-0.0.0.0}:${PORT:-"${EXPOSE_PORT:-8000}"}" + python3 -um app \ + --host "${ADDRESS:-0.0.0.0}" \ + --port "${PORT:-"${EXPOSE_PORT:-8000}"}" + fi +fi diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..6e61f45 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,45 @@ +[metadata] +name = whoogle-search +version = attr: app.version.__version__ +url = https://github.com/benbusby/whoogle-search +description = Self-hosted, ad-free, privacy-respecting metasearch engine +long_description = file: README.md +long_description_content_type = text/markdown +keywords = search, metasearch, flask, adblock, degoogle, privacy +author = Ben Busby +author_email = contact@benbusby.com +license = MIT +classifiers = + Programming Language :: Python :: 3 + License :: OSI Approved :: MIT License + Operating System :: OS Independent + +[options] +packages = find: +include_package_data = True +install_requires= + beautifulsoup4 + brotli + cssutils + cryptography + defusedxml + Flask + python-dotenv + requests + stem + validators + waitress + +[options.extras_require] +test = + pytest + python-dateutil +dev = pycodestyle + +[options.packages.find] +exclude = + test* + +[options.entry_points] +console_scripts = + whoogle-search = app.routes:run_app diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/test/conftest.py b/test/conftest.py new file mode 100644 index 0000000..cec3def --- /dev/null +++ b/test/conftest.py @@ -0,0 +1,24 @@ +from app import app +from app.utils.session import generate_key +import pytest +import random + +demo_config = { + 'near': random.choice(['Seattle', 'New York', 'San Francisco']), + 'dark': str(random.getrandbits(1)), + 'nojs': 
str(random.getrandbits(1)), + 'lang_interface': random.choice(app.config['LANGUAGES'])['value'], + 'lang_search': random.choice(app.config['LANGUAGES'])['value'], + 'country': random.choice(app.config['COUNTRIES'])['value'] +} + + +@pytest.fixture +def client(): + with app.test_client() as client: + with client.session_transaction() as session: + session['uuid'] = 'test' + session['key'] = app.enc_key + session['config'] = {} + session['auth'] = False + yield client diff --git a/test/test_autocomplete.py b/test/test_autocomplete.py new file mode 100644 index 0000000..194a5ab --- /dev/null +++ b/test/test_autocomplete.py @@ -0,0 +1,16 @@ +from app.models.endpoint import Endpoint + + +def test_autocomplete_get(client): + rv = client.get(f'/{Endpoint.autocomplete}?q=green+eggs+and') + assert rv._status_code == 200 + assert len(rv.data) >= 1 + assert b'green eggs and ham' in rv.data + + +def test_autocomplete_post(client): + rv = client.post(f'/{Endpoint.autocomplete}', + data=dict(q='the+cat+in+the')) + assert rv._status_code == 200 + assert len(rv.data) >= 1 + assert b'the cat in the hat' in rv.data diff --git a/test/test_misc.py b/test/test_misc.py new file mode 100644 index 0000000..177af5f --- /dev/null +++ b/test/test_misc.py @@ -0,0 +1,70 @@ +from cryptography.fernet import Fernet + +from app import app +from app.models.endpoint import Endpoint +from app.utils.session import generate_key, valid_user_session + +JAPAN_PREFS = 'uG7IBICwK7FgMJNpUawp2tKDb1Omuv_euy-cJHVZ' \ + + 'BSydthgwxRFIHxiVA8qUGavKaDXyiM5uNuPIjKbEAW-zB_vzNXWVaafFhW7k2' \ + + 'fO2_mS5e5eK41XXWwiViTz2VVmGWje0UgQwwVPe1A7aH0s10FgARsd2xl5nlg' \ + + 'RLHT2krPUw-iLQ5uHZSnYXFuF4caYemWcj4vqB2ocHkt-aqn04jgnnlWWME_K' \ + + '9ySWdWmPyS66HtLt1tCwc_-xGZklvbHw==' + + +def test_generate_user_keys(): + key = generate_key() + assert Fernet(key) + assert generate_key() != key + + +def test_valid_session(client): + assert not valid_user_session({'key': '', 'config': {}}) + with client.session_transaction() as 
session: + assert valid_user_session(session) + + +def test_valid_translation_keys(client): + valid_lang_keys = [_['value'] for _ in app.config['LANGUAGES']] + en_keys = app.config['TRANSLATIONS']['lang_en'].keys() + for translation_key in app.config['TRANSLATIONS']: + # Ensure the translation is using a valid language value + assert translation_key in valid_lang_keys + + # Ensure all translations match the same size/content of the original + # English translation + assert app.config['TRANSLATIONS'][translation_key].keys() == en_keys + + +def test_query_decryption(client): + # FIXME: Handle decryption errors in search.py and rewrite test + # This previously was used to test swapping decryption keys between + # queries. While this worked in theory and usually didn't cause problems, + # they were tied to session IDs and those are really unreliable (meaning + # that occasionally page navigation would break). + rv = client.get('/') + cookie = rv.headers['Set-Cookie'] + + rv = client.get(f'/{Endpoint.search}?q=test+1', headers={'Cookie': cookie}) + assert rv._status_code == 200 + + with client.session_transaction() as session: + assert valid_user_session(session) + + rv = client.get(f'/{Endpoint.search}?q=test+2', headers={'Cookie': cookie}) + assert rv._status_code == 200 + + with client.session_transaction() as session: + assert valid_user_session(session) + + +def test_prefs_url(client): + base_url = f'/{Endpoint.search}?q=wikipedia' + rv = client.get(base_url) + assert rv._status_code == 200 + assert b'wikipedia.org' in rv.data + assert b'ja.wikipedia.org' not in rv.data + + rv = client.get(f'{base_url}&preferences={JAPAN_PREFS}') + assert rv._status_code == 200 + assert b'ja.wikipedia.org' in rv.data + diff --git a/test/test_results.py b/test/test_results.py new file mode 100644 index 0000000..ad0fd3e --- /dev/null +++ b/test/test_results.py @@ -0,0 +1,158 @@ +from bs4 import BeautifulSoup +from app.filter import Filter +from app.models.config import Config +from 
app.models.endpoint import Endpoint +from app.utils import results +from app.utils.session import generate_key +from datetime import datetime +from dateutil.parser import ParserError, parse +from urllib.parse import urlparse + +from test.conftest import demo_config + + +def get_search_results(data): + secret_key = generate_key() + soup = Filter(user_key=secret_key, config=Config(**demo_config)).clean( + BeautifulSoup(data, 'html.parser')) + + main_divs = soup.find('div', {'id': 'main'}) + assert len(main_divs) > 1 + + result_divs = [] + for div in main_divs: + # Result divs should only have 1 inner div + if (len(list(div.children)) != 1 + or not div.findChild() + or 'div' not in div.findChild().name): + continue + + result_divs.append(div) + + return result_divs + + +def test_get_results(client): + rv = client.get(f'/{Endpoint.search}?q=test') + assert rv._status_code == 200 + + # Depending on the search, there can be more + # than 10 result divs + results = get_search_results(rv.data) + assert len(results) >= 10 + assert len(results) <= 15 + + +def test_post_results(client): + rv = client.post(f'/{Endpoint.search}', data=dict(q='test')) + assert rv._status_code == 302 + + +def test_translate_search(client): + rv = client.get(f'/{Endpoint.search}?q=translate hola') + assert rv._status_code == 200 + + # Pretty weak test, but better than nothing + str_data = str(rv.data) + assert 'iframe' in str_data + assert '/auto/en/ hola' in str_data + + +def test_block_results(client): + rv = client.get(f'/{Endpoint.search}?q=pinterest') + assert rv._status_code == 200 + + has_pinterest = False + for link in BeautifulSoup(rv.data, 'html.parser').find_all('a', href=True): + if 'pinterest.com' in urlparse(link['href']).netloc: + has_pinterest = True + break + + assert has_pinterest + + demo_config['block'] = 'pinterest.com' + rv = client.post(f'/{Endpoint.config}', data=demo_config) + assert rv._status_code == 302 + + rv = client.get(f'/{Endpoint.search}?q=pinterest') + assert 
rv._status_code == 200 + + for link in BeautifulSoup(rv.data, 'html.parser').find_all('a', href=True): + result_site = urlparse(link['href']).netloc + if not result_site: + continue + assert result_site not in 'pinterest.com' + + +def test_view_my_ip(client): + rv = client.get(f'/{Endpoint.search}?q=my ip address') + assert rv._status_code == 200 + + # Pretty weak test, but better than nothing + str_data = str(rv.data) + assert 'Your public IP address' in str_data + assert '127.0.0.1' in str_data + + +def test_recent_results(client): + times = { + 'tbs=qdr:y': 365, + 'tbs=qdr:m': 31, + 'tbs=qdr:w': 7 + } + + for time, num_days in times.items(): + rv = client.get(f'/{Endpoint.search}?q=test&' + time) + result_divs = get_search_results(rv.data) + + current_date = datetime.now() + for div in [_ for _ in result_divs if _.find('span')]: + date_span = div.find('span').decode_contents() + if not date_span or len(date_span) > 15 or len(date_span) < 7: + continue + + try: + date = parse(date_span) + # Date can have a little bit of wiggle room + assert (current_date - date).days <= (num_days + 5) + except ParserError: + pass + + +def test_leading_slash_search(client): + # Ensure searches with a leading slash are interpreted + # correctly as queries and not endpoints + q = '/test' + rv = client.get(f'/{Endpoint.search}?q={q}') + assert rv._status_code == 200 + + soup = Filter( + user_key=generate_key(), + config=Config(**demo_config), + query=q + ).clean(BeautifulSoup(rv.data, 'html.parser')) + + for link in soup.find_all('a', href=True): + if 'start=' not in link['href']: + continue + + assert link['href'].startswith(f'{Endpoint.search}') + + +def test_site_alt_prefix_skip(): + # Ensure prefixes are skipped correctly for site alts + + # default silte_alts (farside.link) + assert results.get_site_alt(link = 'https://www.reddit.com') == 'https://farside.link/libreddit' + assert results.get_site_alt(link = 'https://www.twitter.com') == 'https://farside.link/nitter' + assert 
results.get_site_alt(link = 'https://www.youtube.com') == 'https://farside.link/invidious' + + test_site_alts = { + 'reddit.com': 'reddit.endswithmobile.domain', + 'twitter.com': 'https://twitter.endswithm.domain', + 'youtube.com': 'http://yt.endswithwww.domain', + } + # Domains with part of SKIP_PREFIX in them + assert results.get_site_alt(link = 'https://www.reddit.com', site_alts = test_site_alts) == 'https://reddit.endswithmobile.domain' + assert results.get_site_alt(link = 'https://www.twitter.com', site_alts = test_site_alts) == 'https://twitter.endswithm.domain' + assert results.get_site_alt(link = 'https://www.youtube.com', site_alts = test_site_alts) == 'http://yt.endswithwww.domain' diff --git a/test/test_routes.py b/test/test_routes.py new file mode 100644 index 0000000..1f64827 --- /dev/null +++ b/test/test_routes.py @@ -0,0 +1,91 @@ +from app import app +from app.models.endpoint import Endpoint + +import json + +from test.conftest import demo_config + + +def test_main(client): + rv = client.get('/') + assert rv._status_code == 200 + + +def test_search(client): + rv = client.get(f'/{Endpoint.search}?q=test') + assert rv._status_code == 200 + + +def test_feeling_lucky(client): + # Bang at beginning of query + rv = client.get(f'/{Endpoint.search}?q=!%20wikipedia') + assert rv._status_code == 303 + assert rv.headers.get('Location').startswith('https://www.wikipedia.org') + + # Move bang to end of query + rv = client.get(f'/{Endpoint.search}?q=github%20!') + assert rv._status_code == 303 + assert rv.headers.get('Location').startswith('https://github.com') + + +def test_ddg_bang(client): + # Bang at beginning of query + rv = client.get(f'/{Endpoint.search}?q=!gh%20whoogle') + assert rv._status_code == 302 + assert rv.headers.get('Location').startswith('https://github.com') + + # Move bang to end of query + rv = client.get(f'/{Endpoint.search}?q=github%20!w') + assert rv._status_code == 302 + assert 
rv.headers.get('Location').startswith('https://en.wikipedia.org') + + # Move bang to middle of query + rv = client.get(f'/{Endpoint.search}?q=big%20!r%20chungus') + assert rv._status_code == 302 + assert rv.headers.get('Location').startswith('https://www.reddit.com') + + # Ensure bang is case insensitive + rv = client.get(f'/{Endpoint.search}?q=!GH%20whoogle') + assert rv._status_code == 302 + assert rv.headers.get('Location').startswith('https://github.com') + + # Ensure bang without a query still redirects to the result + rv = client.get(f'/{Endpoint.search}?q=!gh') + assert rv._status_code == 302 + assert rv.headers.get('Location').startswith('https://github.com') + + +def test_custom_bang(client): + # Bang at beginning of query + rv = client.get(f'/{Endpoint.search}?q=!i%20whoogle') + assert rv._status_code == 302 + assert rv.headers.get('Location').startswith('search?q=') + + +def test_config(client): + rv = client.post(f'/{Endpoint.config}', data=demo_config) + assert rv._status_code == 302 + + rv = client.get(f'/{Endpoint.config}') + assert rv._status_code == 200 + + config = json.loads(rv.data) + for key in demo_config.keys(): + assert config[key] == demo_config[key] + + # Test disabling changing config from client + app.config['CONFIG_DISABLE'] = 1 + dark_mod = not demo_config['dark'] + demo_config['dark'] = dark_mod + rv = client.post(f'/{Endpoint.config}', data=demo_config) + assert rv._status_code == 403 + + rv = client.get(f'/{Endpoint.config}') + config = json.loads(rv.data) + assert config['dark'] != dark_mod + + +def test_opensearch(client): + rv = client.get(f'/{Endpoint.opensearch}') + assert rv._status_code == 200 + assert 'Whoogle' in str(rv.data) diff --git a/whoogle.template.env b/whoogle.template.env new file mode 100644 index 0000000..ee2a502 --- /dev/null +++ b/whoogle.template.env @@ -0,0 +1,94 @@ +# ---------------------------------- +# Rename to "whoogle.env" before use +# ---------------------------------- +# You can set Whoogle 
environment variables here, but must +# modify your deployment to enable these values: +# - Local: Set WHOOGLE_DOTENV=1 +# - docker-compose: Uncomment the env_file option +# - docker: Add "--env-file ./whoogle.env" to your build command + +#WHOOGLE_ALT_TW=farside.link/nitter +#WHOOGLE_ALT_YT=farside.link/invidious +#WHOOGLE_ALT_IG=farside.link/bibliogram/u +#WHOOGLE_ALT_RD=farside.link/libreddit +#WHOOGLE_ALT_MD=farside.link/scribe +#WHOOGLE_ALT_TL=farside.link/lingva +#WHOOGLE_ALT_IMG=farside.link/rimgo +#WHOOGLE_ALT_WIKI=farside.link/wikiless +#WHOOGLE_ALT_IMDB=farside.link/libremdb +#WHOOGLE_ALT_QUORA=farside.link/quetre +#WHOOGLE_ALT_SO=farside.link/anonymousoverflow +#WHOOGLE_USER="" +#WHOOGLE_PASS="" +#WHOOGLE_PROXY_USER="" +#WHOOGLE_PROXY_PASS="" +#WHOOGLE_PROXY_TYPE="" +#WHOOGLE_PROXY_LOC="" +#WHOOGLE_CSP=1 +#HTTPS_ONLY=1 + +# The URL prefix to use for the whoogle instance (i.e. "/whoogle") +#WHOOGLE_URL_PREFIX="" + +# Restrict results to only those near a particular city +#WHOOGLE_CONFIG_NEAR=denver + +# See app/static/settings/countries.json for values +#WHOOGLE_CONFIG_COUNTRY=US + +# See app/static/settings/languages.json for values +#WHOOGLE_CONFIG_LANGUAGE=lang_en + +# See app/static/settings/languages.json for values +#WHOOGLE_CONFIG_SEARCH_LANGUAGE=lang_en + +# Disable changing of config from client +#WHOOGLE_CONFIG_DISABLE=1 + +# Block websites from search results (comma-separated list) +#WHOOGLE_CONFIG_BLOCK=pinterest.com,whitehouse.gov + +# Theme (light, dark, or system) +#WHOOGLE_CONFIG_THEME=system + +# Safe search mode +#WHOOGLE_CONFIG_SAFE=1 + +# Use social media site alternatives (nitter, bibliogram, etc) +#WHOOGLE_CONFIG_ALTS=1 + +# Use Tor if available +#WHOOGLE_CONFIG_TOR=1 + +# Open results in new tab +#WHOOGLE_CONFIG_NEW_TAB=1 + +# Enable View Image option +#WHOOGLE_CONFIG_VIEW_IMAGE=1 + +# Search using GET requests only (exposes query in logs) +#WHOOGLE_CONFIG_GET_ONLY=1 + +# Remove everything except basic result cards from all search 
queries +#WHOOGLE_MINIMAL=0 + +# Set the number of results per page +#WHOOGLE_RESULTS_PER_PAGE=10 + +# Controls visibility of autocomplete/search suggestions +#WHOOGLE_AUTOCOMPLETE=1 + +# The port where Whoogle will be exposed +#EXPOSE_PORT=5000 + +# Set instance URL +#WHOOGLE_CONFIG_URL=https://whoogle.example.com/ + +# Set custom CSS styling/theming +#WHOOGLE_CONFIG_STYLE=":root { /* LIGHT THEME COLORS */ --whoogle-background: #d8dee9; --whoogle-accent: #2e3440; --whoogle-text: #3B4252; --whoogle-contrast-text: #eceff4; --whoogle-secondary-text: #70757a; --whoogle-result-bg: #fff; --whoogle-result-title: #4c566a; --whoogle-result-url: #81a1c1; --whoogle-result-visited: #a3be8c; /* DARK THEME COLORS */ --whoogle-dark-background: #222; --whoogle-dark-accent: #685e79; --whoogle-dark-text: #fff; --whoogle-dark-contrast-text: #000; --whoogle-dark-secondary-text: #bbb; --whoogle-dark-result-bg: #000; --whoogle-dark-result-title: #1967d2; --whoogle-dark-result-url: #4b11a8; --whoogle-dark-result-visited: #bbbbff; }" + +# Enable preferences encryption (requires key) +#WHOOGLE_CONFIG_PREFERENCES_ENCRYPTED=1 + +# Set the key used to encrypt preferences in the URL +#WHOOGLE_CONFIG_PREFERENCES_KEY="NEEDS_TO_BE_MODIFIED" \ No newline at end of file