Data sources

WikipediaLoader

Bases: BaseWebAPIDataLoader

Source code in autoresearcher/data_sources/web_apis/wikipedia_loader.py
class WikipediaLoader(BaseWebAPIDataLoader):
    def __init__(self):
        super().__init__("https://en.wikipedia.org/w/api.php")

    def fetch_data(self, search_query, results=10, language="en"):
        """
        Fetches data from the Wikipedia API.
        Args:
          search_query (str): The query to search for.
          results (int, optional): The maximum number of results to return. Defaults to 10.
          language (str, optional): The language to search in. Defaults to "en".
        Returns:
          list: A list of dictionaries containing the data for each result.
        Raises:
          wikipedia.exceptions.DisambiguationError: If the search query returns a disambiguation page.
        Examples:
          >>> loader = WikipediaLoader()
          >>> loader.fetch_data("Python")
          [
            {
              "title": "Python (programming language)",
              "url": "https://en.wikipedia.org/wiki/Python_(programming_language)",
              "summary": "Python is an interpreted, high-level, general-purpose programming language.",
              "content": "Python is an interpreted, high-level, general-purpose programming language...",
              "categories": ["Programming languages"],
              "references": ["https://www.python.org/"]
            }
          ]
        """
        wikipedia.set_lang(language)
        wikipedia.set_rate_limiting(True)

        search_results = wikipedia.search(search_query, results=results)
        data = []

        for result in search_results:
            try:
                page = wikipedia.page(result)
                data.append(
                    {
                        "title": page.title,
                        "url": page.url,
                        "summary": page.summary,
                        "content": page.content,
                        "categories": page.categories,
                        "references": page.references,
                    }
                )
            except wikipedia.exceptions.DisambiguationError as e:
                # Handle disambiguation pages by selecting the first option
                if e.options:
                    page = wikipedia.page(e.options[0])
                    data.append(
                        {
                            "title": page.title,
                            "url": page.url,
                            "summary": page.summary,
                            "content": page.content,
                            "categories": page.categories,
                            "references": page.references,
                        }
                    )
            except wikipedia.exceptions.PageError:
                # Skip pages that cannot be found
                continue

        return data

fetch_data(search_query, results=10, language='en')

Fetches data from the Wikipedia API.

Parameters:

    search_query (str, required): The query to search for.
    results (int, default 10): The maximum number of results to return.
    language (str, default 'en'): The language to search in.

Returns:

    list: A list of dictionaries containing the data for each result.

Raises:

    wikipedia.exceptions.DisambiguationError: If the search query returns a disambiguation page.

Examples:

>>> loader = WikipediaLoader()
>>> loader.fetch_data("Python")
[
  {
    "title": "Python (programming language)",
    "url": "https://en.wikipedia.org/wiki/Python_(programming_language)",
    "summary": "Python is an interpreted, high-level, general-purpose programming language.",
    "content": "Python is an interpreted, high-level, general-purpose programming language...",
    "categories": ["Programming languages"],
    "references": ["https://www.python.org/"]
  }
]
Source code in autoresearcher/data_sources/web_apis/wikipedia_loader.py
def fetch_data(self, search_query, results=10, language="en"):
    """
    Fetches data from the Wikipedia API.
    Args:
      search_query (str): The query to search for.
      results (int, optional): The maximum number of results to return. Defaults to 10.
      language (str, optional): The language to search in. Defaults to "en".
    Returns:
      list: A list of dictionaries containing the data for each result.
    Raises:
      wikipedia.exceptions.DisambiguationError: If the search query returns a disambiguation page.
    Examples:
      >>> loader = WikipediaLoader()
      >>> loader.fetch_data("Python")
      [
        {
          "title": "Python (programming language)",
          "url": "https://en.wikipedia.org/wiki/Python_(programming_language)",
          "summary": "Python is an interpreted, high-level, general-purpose programming language.",
          "content": "Python is an interpreted, high-level, general-purpose programming language...",
          "categories": ["Programming languages"],
          "references": ["https://www.python.org/"]
        }
      ]
    """
    wikipedia.set_lang(language)
    wikipedia.set_rate_limiting(True)

    search_results = wikipedia.search(search_query, results=results)
    data = []

    for result in search_results:
        try:
            page = wikipedia.page(result)
            data.append(
                {
                    "title": page.title,
                    "url": page.url,
                    "summary": page.summary,
                    "content": page.content,
                    "categories": page.categories,
                    "references": page.references,
                }
            )
        except wikipedia.exceptions.DisambiguationError as e:
            # Handle disambiguation pages by selecting the first option
            if e.options:
                page = wikipedia.page(e.options[0])
                data.append(
                    {
                        "title": page.title,
                        "url": page.url,
                        "summary": page.summary,
                        "content": page.content,
                        "categories": page.categories,
                        "references": page.references,
                    }
                )
        except wikipedia.exceptions.PageError:
            # Skip pages that cannot be found
            continue

    return data
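
A minimal usage sketch for WikipediaLoader (the import path is inferred from the file path shown above, and it assumes the `wikipedia` package is installed and network access is available; the query is illustrative only):

from autoresearcher.data_sources.web_apis.wikipedia_loader import WikipediaLoader

loader = WikipediaLoader()
# Fetch up to 5 English-language results for a query
pages = loader.fetch_data("transformer neural network", results=5, language="en")
for page in pages:
    # Each entry is a dict with title, url, summary, content, categories, references
    print(page["title"], "-", page["url"])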

SemanticScholarLoader

Bases: BaseWebAPIDataLoader

Source code in autoresearcher/data_sources/web_apis/semantic_scholar_loader.py
class SemanticScholarLoader(BaseWebAPIDataLoader):
    def __init__(self):
        """
        Initializes the SemanticScholarLoader class.
        Args:
          None
        Returns:
          None
        Notes:
          Calls the superclass constructor with the SemanticScholar API URL.
        """
        super().__init__("https://api.semanticscholar.org/graph/v1/paper/search")

    def fetch_data(self, search_query, limit=100, year_range=None):
        """
        Fetches data from the SemanticScholar API.
        Args:
          search_query (str): The query to search for.
          limit (int, optional): The maximum number of results to return. Defaults to 100.
          year_range (tuple, optional): A tuple of two integers representing the start and end year of the search. Defaults to None.
        Returns:
          list: A list of paper objects.
        Examples:
          >>> fetch_data("machine learning", limit=50, year_range=(2010, 2020))
          [{...}, {...}, ...]
        """
        params = {
            "query": search_query,
            "limit": limit,
            "fields": "title,url,abstract,authors,citationStyles,journal,citationCount,year,externalIds",
        }

        if year_range is not None:
            params["year"] = year_range

        data = self.make_request("", params=params)
        return data.get("data", [])

    def fetch_and_sort_papers(
        self,
        search_query,
        limit=100,
        top_n=20,
        year_range=None,
        keyword_combinations=None,
        weight_similarity=0.5,
    ):
        """
        Fetches and sorts papers from the SemanticScholar API.
        Args:
          search_query (str): The query to search for.
          limit (int, optional): The maximum number of results to return. Defaults to 100.
          top_n (int, optional): The maximum number of results to return after sorting. Defaults to 20.
          year_range (tuple, optional): A tuple of two integers representing the start and end year of the search. Defaults to None.
          keyword_combinations (list, optional): A list of keyword combinations to search for. Defaults to None.
          weight_similarity (float, optional): The weight to give to the similarity score when sorting. Defaults to 0.5.
        Returns:
          list: A list of the top `top_n` paper objects sorted by combined score.
        Examples:
          >>> fetch_and_sort_papers("machine learning", limit=50, top_n=10, year_range=(2010, 2020))
          [{...}, {...}, ...]
        """
        papers = []
        if keyword_combinations is None:
            keyword_combinations = [search_query]

        for combination in keyword_combinations:
            papers.extend(self.fetch_data(combination, limit, year_range))

        max_citations = max(papers, key=lambda x: x["citationCount"])["citationCount"]

        for paper in papers:
            similarity = jellyfish.jaro_similarity(search_query, paper["title"])
            normalized_citation_count = paper["citationCount"] / max_citations
            paper["combined_score"] = (weight_similarity * similarity) + (
                (1 - weight_similarity) * normalized_citation_count
            )

        sorted_papers = sorted(papers, key=lambda x: x["combined_score"], reverse=True)

        # deduplicate paper entries prior to taking top n results
        sorted_dedup_papers = list(
            {each_paper["paperId"]: each_paper for each_paper in sorted_papers}.values()
        )

        return sorted_dedup_papers[:top_n]

__init__()

Initializes the SemanticScholarLoader class.

Returns:

    None

Notes

Calls the superclass constructor with the SemanticScholar API URL.

Source code in autoresearcher/data_sources/web_apis/semantic_scholar_loader.py
def __init__(self):
    """
    Initializes the SemanticScholarLoader class.
    Args:
      None
    Returns:
      None
    Notes:
      Calls the superclass constructor with the SemanticScholar API URL.
    """
    super().__init__("https://api.semanticscholar.org/graph/v1/paper/search")

fetch_and_sort_papers(search_query, limit=100, top_n=20, year_range=None, keyword_combinations=None, weight_similarity=0.5)

Fetches and sorts papers from the SemanticScholar API.

Parameters:

    search_query (str, required): The query to search for.
    limit (int, default 100): The maximum number of results to return.
    top_n (int, default 20): The maximum number of results to return after sorting.
    year_range (tuple, default None): A tuple of two integers representing the start and end year of the search.
    keyword_combinations (list, default None): A list of keyword combinations to search for.
    weight_similarity (float, default 0.5): The weight to give to the similarity score when sorting.

Returns:

    list: A list of the top `top_n` paper objects sorted by combined score.

Examples:

>>> fetch_and_sort_papers("machine learning", limit=50, top_n=10, year_range=(2010, 2020))
[{...}, {...}, ...]
Source code in autoresearcher/data_sources/web_apis/semantic_scholar_loader.py
def fetch_and_sort_papers(
    self,
    search_query,
    limit=100,
    top_n=20,
    year_range=None,
    keyword_combinations=None,
    weight_similarity=0.5,
):
    """
    Fetches and sorts papers from the SemanticScholar API.
    Args:
      search_query (str): The query to search for.
      limit (int, optional): The maximum number of results to return. Defaults to 100.
      top_n (int, optional): The maximum number of results to return after sorting. Defaults to 20.
      year_range (tuple, optional): A tuple of two integers representing the start and end year of the search. Defaults to None.
      keyword_combinations (list, optional): A list of keyword combinations to search for. Defaults to None.
      weight_similarity (float, optional): The weight to give to the similarity score when sorting. Defaults to 0.5.
    Returns:
      list: A list of the top `top_n` paper objects sorted by combined score.
    Examples:
      >>> fetch_and_sort_papers("machine learning", limit=50, top_n=10, year_range=(2010, 2020))
      [{...}, {...}, ...]
    """
    papers = []
    if keyword_combinations is None:
        keyword_combinations = [search_query]

    for combination in keyword_combinations:
        papers.extend(self.fetch_data(combination, limit, year_range))

    max_citations = max(papers, key=lambda x: x["citationCount"])["citationCount"]

    for paper in papers:
        similarity = jellyfish.jaro_similarity(search_query, paper["title"])
        normalized_citation_count = paper["citationCount"] / max_citations
        paper["combined_score"] = (weight_similarity * similarity) + (
            (1 - weight_similarity) * normalized_citation_count
        )

    sorted_papers = sorted(papers, key=lambda x: x["combined_score"], reverse=True)

    # deduplicate paper entries prior to taking top n results
    sorted_dedup_papers = list(
        {each_paper["paperId"]: each_paper for each_paper in sorted_papers}.values()
    )

    return sorted_dedup_papers[:top_n]
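
To illustrate how `combined_score` is computed, the sketch below applies the same formula to one hypothetical paper (the numbers are made up; `jellyfish` is the library already used in the source above):

import jellyfish

search_query = "machine learning"
paper = {"title": "A Survey of Machine Learning", "citationCount": 250}
max_citations = 1000        # highest citation count among all fetched papers
weight_similarity = 0.5

# Jaro string similarity between the query and the paper title, in [0, 1]
similarity = jellyfish.jaro_similarity(search_query, paper["title"])
# Citation count normalized against the most-cited paper in the result set
normalized_citation_count = paper["citationCount"] / max_citations

combined_score = (weight_similarity * similarity) + (
    (1 - weight_similarity) * normalized_citation_count
)
print(round(combined_score, 3))

Papers are then sorted by this score in descending order, deduplicated by paperId, and truncated to the top `top_n` entries.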

fetch_data(search_query, limit=100, year_range=None)

Fetches data from the SemanticScholar API.

Parameters:

    search_query (str, required): The query to search for.
    limit (int, default 100): The maximum number of results to return.
    year_range (tuple, default None): A tuple of two integers representing the start and end year of the search.

Returns:

    list: A list of paper objects.

Examples:

>>> fetch_data("machine learning", limit=50, year_range=(2010, 2020))
[{...}, {...}, ...]
Source code in autoresearcher/data_sources/web_apis/semantic_scholar_loader.py
def fetch_data(self, search_query, limit=100, year_range=None):
    """
    Fetches data from the SemanticScholar API.
    Args:
      search_query (str): The query to search for.
      limit (int, optional): The maximum number of results to return. Defaults to 100.
      year_range (tuple, optional): A tuple of two integers representing the start and end year of the search. Defaults to None.
    Returns:
      list: A list of paper objects.
    Examples:
      >>> fetch_data("machine learning", limit=50, year_range=(2010, 2020))
      [{...}, {...}, ...]
    """
    params = {
        "query": search_query,
        "limit": limit,
        "fields": "title,url,abstract,authors,citationStyles,journal,citationCount,year,externalIds",
    }

    if year_range is not None:
        params["year"] = year_range

    data = self.make_request("", params=params)
    return data.get("data", [])
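
A minimal usage sketch for SemanticScholarLoader (the import path is inferred from the file path shown above; the query, keyword combinations, and weight are illustrative only):

from autoresearcher.data_sources.web_apis.semantic_scholar_loader import SemanticScholarLoader

loader = SemanticScholarLoader()

# Raw search results from the Semantic Scholar Graph API
papers = loader.fetch_data("contrastive learning", limit=50)

# Top 10 papers ranked by a blend of title similarity and citation count
top_papers = loader.fetch_and_sort_papers(
    "contrastive learning",
    limit=50,
    top_n=10,
    keyword_combinations=["contrastive learning", "self-supervised representation learning"],
    weight_similarity=0.6,
)
for paper in top_papers:
    print(paper["title"], paper["citationCount"], round(paper["combined_score"], 3))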

BaseWebAPIDataLoader

Bases: ABC

Source code in autoresearcher/data_sources/web_apis/base_web_api_data_loader.py
class BaseWebAPIDataLoader(ABC):
    def __init__(self, base_url):
        self.base_url = base_url

    @abstractmethod
    def fetch_data(self, search_query, **kwargs):
        """
        Fetches data from the API.
        Args:
          search_query (str): The search query to use.
          **kwargs: Additional keyword arguments to pass to the API.
        Returns:
          dict: The response from the API.
        Raises:
          NotImplementedError: If the method is not implemented.
        """
        pass

    def make_request(self, endpoint, params=None):
        """
        Makes a request to the API.
        Args:
          endpoint (str): The API endpoint to make the request to.
          params (dict, optional): Additional parameters to pass to the API. Defaults to None.
        Returns:
          dict: The response from the API.
        Raises:
          Exception: If the request fails.
        """
        url = f"{self.base_url}{endpoint}"
        response = requests.get(url, params=params)

        if response.status_code == 200:
            data = response.json()
            return data
        else:
            raise Exception(f"Failed to fetch data from API: {response.status_code}")
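
Concrete loaders subclass BaseWebAPIDataLoader, pass their base URL to the constructor, and implement fetch_data on top of make_request. A minimal sketch of a hypothetical loader (the import path is inferred from the file path shown above; the API URL, endpoint, and field names are illustrative, not part of the library):

from autoresearcher.data_sources.web_apis.base_web_api_data_loader import BaseWebAPIDataLoader


class ExampleJSONLoader(BaseWebAPIDataLoader):
    def __init__(self):
        # Hypothetical JSON API, used only for illustration
        super().__init__("https://api.example.com/v1")

    def fetch_data(self, search_query, limit=10, **kwargs):
        # Delegate the HTTP call to the shared make_request helper
        params = {"q": search_query, "limit": limit}
        data = self.make_request("/search", params=params)
        return data.get("results", [])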

fetch_data(search_query, **kwargs) abstractmethod

Fetches data from the API.

Parameters:

    search_query (str, required): The search query to use.
    **kwargs: Additional keyword arguments to pass to the API.

Returns:

    dict: The response from the API.

Raises:

    NotImplementedError: If the method is not implemented.

Source code in autoresearcher/data_sources/web_apis/base_web_api_data_loader.py
@abstractmethod
def fetch_data(self, search_query, **kwargs):
    """
    Fetches data from the API.
    Args:
      search_query (str): The search query to use.
      **kwargs: Additional keyword arguments to pass to the API.
    Returns:
      dict: The response from the API.
    Raises:
      NotImplementedError: If the method is not implemented.
    """
    pass

make_request(endpoint, params=None)

Makes a request to the API.

Parameters:

    endpoint (str, required): The API endpoint to make the request to.
    params (dict, default None): Additional parameters to pass to the API.

Returns:

    dict: The response from the API.

Raises:

    Exception: If the request fails.

Source code in autoresearcher/data_sources/web_apis/base_web_api_data_loader.py
def make_request(self, endpoint, params=None):
    """
    Makes a request to the API.
    Args:
      endpoint (str): The API endpoint to make the request to.
      params (dict, optional): Additional parameters to pass to the API. Defaults to None.
    Returns:
      dict: The response from the API.
    Raises:
      Exception: If the request fails.
    """
    url = f"{self.base_url}{endpoint}"
    response = requests.get(url, params=params)

    if response.status_code == 200:
        data = response.json()
        return data
    else:
        raise Exception(f"Failed to fetch data from API: {response.status_code}")
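
Because make_request raises a plain Exception for any non-200 response, callers that need to degrade gracefully can wrap the call; a small sketch (the loader, endpoint, and query parameters are illustrative):

loader = SemanticScholarLoader()
try:
    data = loader.make_request("", params={"query": "graph neural networks", "limit": 10})
except Exception as error:
    # Non-200 responses (and network errors) surface here
    print(f"Failed to fetch data: {error}")
    data = {}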