Как отправить много документов на сервер Scout из Python с помощью Python-клиента Scout? - PullRequest
0 голосов
/ 10 февраля 2020

Я пытаюсь проиндексировать текст PDF-файлов с помощью Python-библиотеки Scout. То же самое я пробовал сделать и с Elasticsearch. В обоих случаях я не могу понять, как массово добавить текст в индекс из Python.

После долгих поисков я пришёл к выводу, что мне нужно использовать асинхронные (async) HTTP-запросы. Единственная проблема в том, что я не понимаю асинхронные вызовы и не понимаю, что на самом деле представляет собой Python-«клиент» Scout. Я программист-самоучка, и мне ещё многое непонятно. Я думаю, что клиент не может оставаться открытым в цикле, чтобы продолжать использовать одно соединение. Во многих книгах по программированию я встречал такие понятия, как «await» и «сессии» (sessions), однако не знаю, как применить их на практике. Может ли кто-нибудь помочь мне написать код на Python, который успешно отправит новые документы на работающий сервер Scout, и объяснить, как это делается?

Вот моя попытка:

from scout_client import Scout
# import libraries to help read and create PDF
import PyPDF2
from fpdf import FPDF
import base64
import os
from flask import Flask, jsonify, request, render_template,  json
# Before you start: run the server.py file and create a SQLite DB.
# Step one: loop over the PDFs in the 'books' folder (range(14,15) yields only k = 14).
for k in range(14,15):
    # Open books/<k>.pdf for reading.
    read_pdf = PyPDF2.PdfFileReader("books/%s.pdf"%(k))
# Debug aid for step one, left disabled:
    #print (read_pdf)
# Step two: find out how many pages the document has.
    # Page count; it bounds both loops below.
    num = read_pdf.getNumPages()
    print ("PDF pages:", num)
# Step three: collect the extracted text page by page.
    # A list (not a dict): all_pages[0] is the first page, all_pages[num - 1] the last.
    all_pages = []
# Step four: create a new index on the Scout server (disabled; note `client` is only assigned further down).
    # client.create_index('test3')
    # Visit every page index 0 .. num - 1.
    for page in range(num):
        data = read_pdf.getPage(page)
    #page_mode = read_pdf.getPageMode()
    # Extract the page's text.
        page_text = data.extractText()
        # Append the text to the list of pages.
        all_pages.append(page_text)
# Create the client from scout_client.py, pointing at the local server.
    client = Scout('http://localhost:8000')
# The issue: for loops and while loops both end in urllib.error.HTTPError: HTTP Error 400: BAD REQUEST. NOTE(review): i runs 1..num, so all_pages[0] is never sent and all_pages[num] is out of range; the 400 presumably means the server rejects the payload (e.g. index 'test3' was never created) - confirm against the server.
    i = 1
    while i <= num:
        client.create_document(all_pages[i],['test3'])
        print(i,"....done")
        i += 1

Я получаю ошибку:

Traceback (most recent call last):
  File "test.py", line 37, in <module>
    client.create_document(all_pages[i],['test3'])
  File "../Searchtest4/scout/scout_client.py", line 149, in create_document
    return self.post('/documents/', post_data, attachments)
  File "../Searchtest4/scout/scout_client.py", line 53, in post
    return self.post_json(url, data)
  File "../Searchtest4/scout/scout_client.py", line 63, in post_json
    return json.loads(urlopen(request).read().decode('utf8'))
  File "../lib/python3.7/urllib/request.py", line 222, in urlopen
    return opener.open(url, data, timeout)
  File "../lib/python3.7/urllib/request.py", line 531, in open
    response = meth(req, response)
  File "../lib/python3.7/urllib/request.py", line 641, in http_response
    'http', request, response, code, msg, hdrs)
  File "../lib/python3.7/urllib/request.py", line 569, in error
    return self._call_chain(*args)
  File "../lib/python3.7/urllib/request.py", line 503, in _call_chain
    result = func(*args)
  File "../lib/python3.7/urllib/request.py", line 649, in http_error_default
    raise HTTPError(req.full_url, code, msg, hdrs, fp)
**urllib.error.HTTPError: HTTP Error 400: BAD REQUEST**

Вот сервер, который работает нормально (server.py):

import logging
import optparse
import os
import sys

from flask import Flask
from werkzeug.serving import run_simple

from scout.exceptions import InvalidRequestException
from scout.models import database
from scout.models import Attachment
from scout.models import BlobData
from scout.models import Document
from scout.models import Index
from scout.models import IndexDocument
from scout.models import Metadata
from scout.views import register_views


# Logger for the 'scout' namespace; parse_options() attaches a FileHandler to
# it when a log file is given on the command line.
logger = logging.getLogger('scout')


def create_server(config=None, config_file=None):
    """Build the Flask application that serves the Scout API.

    Settings are layered: values from ``config_file`` (a Python file) are
    loaded first, then the ``config`` mapping is applied on top of them.
    The SQLite database is initialized, the views are registered, and
    per-request hooks open and close the database connection.

    :param config: optional mapping of settings applied after the file.
    :param config_file: optional path to a Python config file.
    :returns: the configured Flask application.
    """
    app = Flask(__name__)

    # File settings first, so explicit switches/environment values win.
    if config_file is not None:
        app.config.from_pyfile(config_file)
    if config is not None:
        app.config.update(config)

    # Fall back to 'scout.db' / no pragmas when the settings are unset.
    db_path = app.config.get('DATABASE') or 'scout.db'
    db_pragmas = app.config.get('SQLITE_PRAGMAS') or None
    initialize_database(db_path, pragmas=db_pragmas)
    register_views(app)

    def uses_disk_database():
        # Evaluated on every request; ':memory:' databases are never
        # connected or closed by the hooks below.
        return database.database != ':memory:'

    @app.errorhandler(InvalidRequestException)
    def handle_invalid_request(exc):
        return exc.response()

    @app.before_request
    def connect_database():
        if uses_disk_database():
            database.connect()

    @app.teardown_request
    def close_database(exc):
        if not uses_disk_database():
            return
        if not database.is_closed():
            database.close()

    return app


def initialize_database(database_file, pragmas=None):
    """Point the shared ``database`` at ``database_file`` and create tables.

    :param database_file: database path handed to ``database.init``.
    :param pragmas: optional pragmas forwarded to ``database.init``.
    """
    database.init(database_file, pragmas=pragmas)

    # Use ``database.execution_context`` as the context manager when the
    # attribute exists; otherwise the database object itself.
    context = getattr(database, 'execution_context', database)
    tables = [Attachment, BlobData, Document, Index, IndexDocument, Metadata]

    with context:
        database.create_tables(tables)


def run(app):
    """Serve ``app`` on the host and port found in its configuration.

    With ``DEBUG`` set, the application's own ``run`` method is used in
    debug mode; otherwise ``run_simple`` serves it with threading enabled.

    :param app: application exposing ``config`` (DEBUG, HOST, PORT).
    """
    settings = app.config
    debug = settings['DEBUG']
    host = settings['HOST']
    port = settings['PORT']

    if not debug:
        run_simple(hostname=host, port=port, application=app, threaded=True)
        return

    app.run(host=host, port=port, debug=True)


def panic(s, exit_code=1):
    """Write ``s`` to stderr wrapped in ANSI red codes, then exit.

    :param s: message to display.
    :param exit_code: process exit status (default 1).
    :raises SystemExit: always.
    """
    stream = sys.stderr
    message = '\033[91m%s\033[0m\n' % s
    stream.write(message)
    stream.flush()
    raise SystemExit(exit_code)


def get_option_parser():
    """Return the ``optparse`` parser for the server's command-line switches.

    The options are declared as a table of (flags, settings) pairs and added
    in order, so the generated help lists them in the order shown here.
    """
    option_table = (
        (('-H', '--host'), dict(
            default='127.0.0.1',
            dest='host',
            help='The hostname to listen on. Defaults to 127.0.0.1.')),
        (('-p', '--port'), dict(
            default=8000,
            dest='port',
            help='The port to listen on. Defaults to 8000.',
            type='int')),
        (('-u', '--url-prefix'), dict(
            dest='url_prefix',
            help='URL path to prefix Scout API.')),
        (('-s', '--stem'), dict(
            dest='stem',
            help='Specify stemming algorithm for content.')),
        (('-d', '--debug'), dict(
            action='store_true',
            dest='debug',
            help='Run Flask app in debug mode.')),
        (('-c', '--config'), dict(
            dest='config',
            help='Configuration module (python file).')),
        (('--paginate-by',), dict(
            default=50,
            dest='paginate_by',
            help='Number of documents displayed per page of results, '
                 'default=50',
            type='int')),
        (('-k', '--api-key'), dict(
            dest='api_key',
            help='Set the API key required to access Scout.')),
        (('-C', '--cache-size'), dict(
            default=64,
            dest='cache_size',
            help='SQLite page-cache size (MB). Defaults to 64MB.',
            type='int')),
        (('-f', '--fsync'), dict(
            action='store_true',
            dest='fsync',
            help='Synchronize database to disk on every write.')),
        (('-j', '--journal-mode'), dict(
            default='wal',
            dest='journal_mode',
            help='SQLite journal mode. Defaults to WAL (recommended).')),
        (('-l', '--logfile'), dict(
            dest='logfile',
            help='Log file')),
    )

    parser = optparse.OptionParser()
    for flags, settings in option_table:
        parser.add_option(*flags, **settings)
    return parser

def parse_options():
    """Turn command-line switches and environment variables into an app.

    The database path comes from the single positional argument, falling
    back to the ``SCOUT_DATABASE`` environment variable; the config file
    comes from ``SCOUT_CONFIG`` or ``--config``. Invalid input terminates
    the process through ``panic``.

    :returns: the application built by ``create_server``.
    """
    options, args = get_option_parser().parse_args()

    if options.logfile:
        logger.addHandler(logging.FileHandler(options.logfile))

    config_file = os.environ.get('SCOUT_CONFIG') or options.config
    config = {'DATABASE': os.environ.get('SCOUT_DATABASE')}

    # Exactly one positional argument is accepted; it overrides the
    # environment-provided database path.
    arg_count = len(args)
    if arg_count > 1:
        panic('Error: [%s] only accepts one argument, which is the path '
              'to the database file.' % __file__)
    elif arg_count == 1:
        config['DATABASE'] = args[0]
    elif not config['DATABASE']:
        panic('Error: missing required path to database file.')

    pragmas = [('journal_mode', options.journal_mode)]
    if options.cache_size:
        # Negative value: the option is given in MB and scaled by 1024.
        pragmas.append(('cache_size', -1024 * options.cache_size))
    if not options.fsync:
        pragmas.append(('synchronous', 0))
    config['SQLITE_PRAGMAS'] = pragmas

    # Command-line values below override whatever the config file holds.
    if options.api_key:
        config['AUTHENTICATION'] = options.api_key
    if options.debug:
        config['DEBUG'] = True
    config['HOST'] = options.host or '127.0.0.1'
    config['PORT'] = options.port or 8000
    config['URL_PREFIX'] = options.url_prefix or ''

    page_size = options.paginate_by
    if page_size:
        if not 1 <= page_size <= 1000:
            panic('paginate-by must be between 1 and 1000')
        config['PAGINATE_BY'] = page_size

    stemmer = options.stem
    if stemmer:
        if stemmer not in ('simple', 'porter'):
            panic('Unrecognized stemmer. Must be "porter" or "simple".')
        config['STEM'] = stemmer

    return create_server(config, config_file)


def main():
    """Script entry point: build the app from the options and serve it."""
    run(parse_options())


if __name__ == '__main__':
    main()

и так называемый клиент (scout_client.py):

import base64
import json
try:
    from email.generator import _make_boundary as choose_boundary
except ImportError:
    from mimetools import choose_boundary
import mimetypes
import os
try:
    from urllib.parse import urlencode
except ImportError:
    from urllib import urlencode
try:
    from urllib.request import Request
    from urllib.request import urlopen
except ImportError:
    from urllib2 import Request
    from urllib2 import urlopen
import zlib


# Defaults for Scout(); both are unset, so an endpoint must be passed in.
ENDPOINT = None
KEY = None


class Scout(object):
    """Small HTTP client for a Scout search server.

    Every method issues one request relative to ``endpoint`` and returns the
    decoded JSON response (``get_raw`` / ``download_attachment`` return the
    raw response bytes instead). When ``key`` is set it is sent in a ``key``
    header with every request. HTTP errors raised by ``urlopen`` propagate
    to the caller unchanged.
    """

    def __init__(self, endpoint=ENDPOINT, key=KEY):
        """Store the server URL (without trailing slash) and optional key.

        :raises ValueError: if no endpoint is given.
        """
        if endpoint is None:
            raise ValueError('An endpoint URL is required, e.g. '
                             'Scout("http://localhost:8000").')
        self.endpoint = endpoint.rstrip('/')
        self.key = key

    # -- internal helpers ---------------------------------------------------

    def _headers(self, content_type=None):
        """Return request headers: optional Content-Type plus the API key."""
        headers = {}
        if content_type:
            headers['Content-Type'] = content_type
        if self.key:
            headers['key'] = self.key
        return headers

    @staticmethod
    def _to_bytes(value):
        """Return ``value`` as bytes, encoding text as UTF-8."""
        if isinstance(value, bytes):
            return value
        try:
            return value.encode('utf-8')
        except AttributeError:
            return bytes(value)

    @staticmethod
    def _append_query(url, params):
        """Append ``params`` to ``url`` as a query string.

        Uses ``?`` when the URL has no query yet and ``&`` when it already
        carries parameters, so existing ones are not corrupted.
        """
        if not params:
            return url
        if '?' not in url:
            url += '?'
        elif not url.endswith(('?', '&')):
            url += '&'
        return url + urlencode(params, True)

    @staticmethod
    def _read(request):
        """Send ``request`` and return the body bytes, closing the response."""
        fh = urlopen(request)
        try:
            return fh.read()
        finally:
            fh.close()

    def _read_json(self, request):
        """Send ``request`` and decode the UTF-8 JSON body."""
        return json.loads(self._read(request).decode('utf-8'))

    def _encode_multipart(self, json_data, files, boundary):
        """Build a multipart/form-data body as bytes.

        The body is assembled from bytes throughout so that binary file
        contents can be embedded unchanged; text parts are UTF-8 encoded.
        ``files`` maps filename to a file-like object (anything with
        ``read()``) or to the content itself (bytes or text).
        """
        to_bytes = self._to_bytes
        delimiter = to_bytes('--' + boundary)
        lines = [
            delimiter,
            b'Content-Disposition: form-data; name="data"',
            b'',
            to_bytes(json.dumps(json_data))]

        for i, (filename, file_obj) in enumerate(files.items()):
            try:
                payload = file_obj.read()
            except AttributeError:
                # Not file-like: treat the value itself as the content.
                payload = file_obj
            mimetype = (mimetypes.guess_type(filename)[0] or
                        'application/octet-stream')
            lines.extend((
                delimiter,
                to_bytes('Content-Disposition: file; name="file_%s"; '
                         'filename="%s"' % (i, filename)),
                to_bytes('Content-Type: %s' % mimetype),
                b'',
                to_bytes(payload)))

        lines.append(delimiter + b'--')
        lines.append(b'')
        return b'\r\n'.join(lines)

    # -- transport ------------------------------------------------------------

    def get_full_url(self, url):
        """Return ``url`` (a path starting with '/') joined to the endpoint."""
        return self.endpoint + url

    def get_raw(self, url, **kwargs):
        """GET ``url`` with ``kwargs`` as query parameters; return raw bytes."""
        url = self._append_query(url, kwargs)
        request = Request(self.get_full_url(url),
                          headers=self._headers('application/json'))
        return self._read(request)

    def get(self, url, **kwargs):
        """GET ``url`` and return the decoded JSON response."""
        raw = self.get_raw(url, **kwargs)
        if isinstance(raw, bytes):
            raw = raw.decode('utf-8')
        return json.loads(raw)

    def post(self, url, data=None, files=None):
        """POST ``data``: multipart when ``files`` is given, JSON otherwise."""
        if files:
            return self.post_files(url, data, files)
        return self.post_json(url, data)

    def post_json(self, url, data=None):
        """POST ``data`` (default ``{}``) as a JSON body; return the JSON reply."""
        body = self._to_bytes(json.dumps(data or {}))
        request = Request(self.get_full_url(url), data=body,
                          headers=self._headers('application/json'))
        return self._read_json(request)

    def post_files(self, url, json_data, files=None):
        """POST ``json_data`` together with ``files`` as multipart/form-data.

        :param files: non-empty dict of filename -> file-like object.
        :raises ValueError: if ``files`` is empty or not a dict.
        """
        if not files or not isinstance(files, dict):
            raise ValueError('One or more files is required. Files should be '
                             'passed as a dictionary of filename: file-like-'
                             'object.')
        boundary = choose_boundary()
        body = self._encode_multipart(json_data, files, boundary)
        headers = self._headers(
            'multipart/form-data; boundary=%s' % boundary)
        request = Request(self.get_full_url(url), data=body, headers=headers)
        return self._read_json(request)

    def delete(self, url):
        """Send a DELETE request to ``url`` and return the JSON reply."""
        request = Request(self.get_full_url(url), headers=self._headers())
        # Overriding get_method is how the verb is switched to DELETE here.
        request.get_method = lambda: 'DELETE'
        return self._read_json(request)

    # -- indexes ------------------------------------------------------------

    def get_indexes(self, **kwargs):
        """Return the ``indexes`` entry of the server's root listing."""
        return self.get('/', **kwargs)['indexes']

    def create_index(self, name):
        """Create an index called ``name``."""
        return self.post('/', {'name': name})

    def rename_index(self, old_name, new_name):
        """Rename index ``old_name`` to ``new_name``."""
        return self.post('/%s/' % old_name, {'name': new_name})

    def delete_index(self, name):
        """Delete the index called ``name``."""
        return self.delete('/%s/' % name)

    def get_index(self, name, **kwargs):
        """Fetch index ``name``; ``kwargs`` become query parameters."""
        return self.get('/%s/' % name, **kwargs)

    # -- documents ------------------------------------------------------------

    def get_documents(self, **kwargs):
        """List documents; ``kwargs`` become query parameters."""
        return self.get('/documents/', **kwargs)

    def create_document(self, content, indexes, identifier=None,
                        attachments=None, **metadata):
        """Create a document in one or more indexes.

        :param content: document text.
        :param indexes: index name or list/tuple of index names.
        :param identifier: optional identifier sent with the document.
        :param attachments: optional dict of filename -> file-like object.
        :param metadata: extra keyword arguments sent as ``metadata``.
        """
        if not isinstance(indexes, (list, tuple)):
            indexes = [indexes]
        post_data = {
            'content': content,
            'identifier': identifier,
            'indexes': indexes,
            'metadata': metadata}
        return self.post('/documents/', post_data, attachments)

    def update_document(self, document_id=None, content=None, indexes=None,
                        metadata=None, identifier=None, attachments=None):
        """Update a document addressed by ``document_id`` or ``identifier``.

        ``document_id`` takes precedence; ``identifier`` is used in the URL
        only when no ``document_id`` is given (previously the URL then
        contained the literal text "None").

        :raises ValueError: if neither key is given, or nothing to update.
        """
        lookup = document_id or identifier
        if not lookup:
            raise ValueError('`document_id` or `identifier` must be '
                             'provided.')

        data = {}
        if content is not None:
            data['content'] = content
        if indexes is not None:
            if not isinstance(indexes, (list, tuple)):
                indexes = [indexes]
            data['indexes'] = indexes
        if metadata is not None:
            data['metadata'] = metadata

        if not data and not attachments:
            raise ValueError('Nothing to update.')

        return self.post('/documents/%s/' % lookup, data, attachments)

    def delete_document(self, document_id=None):
        """Delete the document ``document_id``.

        :raises ValueError: if ``document_id`` is missing.
        """
        if not document_id:
            raise ValueError('`document_id` must be provided.')

        return self.delete('/documents/%s/' % document_id)

    def get_document(self, document_id=None):
        """Fetch the document ``document_id``.

        :raises ValueError: if ``document_id`` is missing.
        """
        if not document_id:
            raise ValueError('`document_id` must be provided.')

        return self.get('/documents/%s/' % document_id)

    # -- attachments ----------------------------------------------------------

    def attach_files(self, document_id, attachments):
        """Upload ``attachments`` (filename -> file-like) to a document."""
        return self.post_files('/documents/%s/attachments/' % document_id,
                               {}, attachments)

    def detach_file(self, document_id, filename):
        """Remove attachment ``filename`` from a document."""
        return self.delete('/documents/%s/attachments/%s/' %
                           (document_id, filename))

    def update_file(self, document_id, filename, file_object):
        """Replace attachment ``filename`` with ``file_object``'s content."""
        return self.post_files('/documents/%s/attachments/%s/' %
                               (document_id, filename),
                               {}, {filename: file_object})

    def get_attachments(self, document_id, **kwargs):
        """List a document's attachments; ``kwargs`` become query params."""
        return self.get('/documents/%s/attachments/' % document_id, **kwargs)

    def get_attachment(self, document_id, filename):
        """Fetch the JSON description of one attachment."""
        return self.get('/documents/%s/attachments/%s/' %
                        (document_id, filename))

    def download_attachment(self, document_id, filename):
        """Return the raw bytes of an attachment's download endpoint."""
        return self.get_raw('/documents/%s/attachments/%s/download/' %
                            (document_id, filename))

    def search_attachments(self, **kwargs):
        """Search attachments; ``kwargs`` become query parameters."""
        return self.get('/documents/attachments/', **kwargs)


class SearchProvider(object):
    """Interface describing how an object is presented to the search index.

    Subclasses override the hooks they support; each hook raises
    ``NotImplementedError`` until it is overridden.
    """

    def content(self, obj):
        """Return the content to index for ``obj``."""
        raise NotImplementedError()

    def identifier(self, obj):
        """Return the identifier for ``obj``."""
        raise NotImplementedError()

    def metadata(self, obj):
        """Return the metadata for ``obj``."""
        raise NotImplementedError()


class SearchSite(object):
    """Registry that maps model classes to search providers.

    ``store`` and ``remove`` look up the providers registered for the exact
    type of an object and forward the result to ``client`` (an object with
    ``create_document`` / ``delete_document`` methods) for ``index``.
    """

    def __init__(self, client, index):
        self.client = client
        self.index = index
        # model class -> list of provider instances
        self.registry = {}

    def register(self, model_class, search_provider):
        """Instantiate ``search_provider`` and attach it to ``model_class``."""
        self.registry.setdefault(model_class, []).append(search_provider())

    def unregister(self, model_class, search_provider=None):
        """Remove providers registered for ``model_class``.

        With no ``search_provider`` the whole entry is dropped; otherwise
        only instances of that provider class are removed.
        """
        if search_provider is None:
            self.registry.pop(model_class, None)
        elif model_class in self.registry:
            self.registry[model_class] = [
                sp for sp in self.registry[model_class]
                if not isinstance(sp, search_provider)]

    def store(self, obj):
        """Create one document per provider registered for ``type(obj)``.

        :returns: False when the type is not registered, True otherwise.
        """
        providers = self.registry.get(type(obj))
        if providers is None:
            return False

        for provider in providers:
            content = provider.content(obj)
            try:
                # Copy: the identifier is added below, and the provider's
                # own (possibly shared) dict must not be modified.
                metadata = dict(provider.metadata(obj) or {})
            except NotImplementedError:
                metadata = {}

            try:
                identifier = provider.identifier(obj)
            except NotImplementedError:
                # Identifier is optional when storing.
                pass
            else:
                metadata['identifier'] = identifier

            self.client.create_document(content, self.index, **metadata)

        return True

    def remove(self, obj):
        """Delete the document of each provider registered for ``type(obj)``.

        :returns: False when the type is not registered, True otherwise.
        """
        providers = self.registry.get(type(obj))
        if providers is None:
            return False

        for provider in providers:
            self.client.delete_document(provider.identifier(obj))

        return True

Наконец, документация Scout:

Любая подробная помощь очень ценится:)

1 Ответ

0 голосов
/ 11 февраля 2020

Итак, я нашёл библиотеку под названием scout и... заставил её работать!

from scout_client import Scout
# import libraries to help read and create PDF
import PyPDF2
from fpdf import FPDF
import base64
import os
from flask import Flask, jsonify, request, render_template,  json

import requests

SCOUT_URL = 'http://localhost:8000'
INDEX_NAME = 'test13'
DOCUMENTS_URL = SCOUT_URL + '/documents/'

client = Scout(SCOUT_URL)

# One session for the whole run: the connection to the server is reused
# instead of being re-opened for every page, and the JSON header is set once.
session = requests.Session()
session.headers.update({'Content-Type': 'application/json'})

for k in range(7, 18):
    read_pdf = PyPDF2.PdfFileReader("books/%s.pdf" % k)
    num = read_pdf.getNumPages()
    print("PDF pages:", num)

    # One text entry per page, in page order.
    all_pages = [read_pdf.getPage(page).extractText() for page in range(num)]

    # Reset per PDF so the summary below never shows a previous file's reply.
    response = None
    for page_text in all_pages:
        payload = {'content': page_text, 'indexes': [INDEX_NAME]}
        # `json` is the name imported at the top of the script.
        response = session.post(DOCUMENTS_URL, data=json.dumps(payload))
        # Stop on the first rejected page instead of silently continuing.
        response.raise_for_status()

    # Reply to the last page of this PDF (None when the PDF has no pages).
    print(response)
  • Теперь я могу в цикле обработать столько локальных PDF-файлов, сколько мне нужно
  • отправлять их на сервер для индексации
  • и искать по ключевым словам

Теперь мне просто нужна помощь в создании простого фронтенда со строкой поиска, который получает данные из JSON-ответа с помощью Python и Flask.

...